Predicting Delays
1 Introduction
1.1 개요
\[ EDA - Feature Engineering - Modeling - Ensemble - Prediction \]
- EDA(Exploratory Data Analysis)
데이터 구조를 살펴보며 지연에 영향을 미치는 요인을 살펴봅니다.
- Feature Engineering
지연 예측력을 향상시키기 위해 발견한 요인을 유용한 형태의 변수로 가공합니다.
- Modeling
미리 선정한 5개의 학습법 모델에 데이터를 적합시킵니다.
- Ensemble
5개의 학습법 모델을 모두 고려하여 예측 결과를 도출합니다.
- Prediction
정확한 예측을 위해 일괄 예측 대신 날짜별, 기체별로 운항 순서에 따라 한 행씩 예측합니다.
1.2 환경설정
# Load required packages ----
library(rmdformats)   # document theme
library(tidyverse)    # ggplot2, stringr, dplyr
library(data.table)   # fast data loading (fread)
library(kernlab)      # Support Vector Machine
library(xgboost)      # XGBoost
library(randomForest) # Random Forest
library(nnet)         # Neural Network
library(Metrics)      # MSE computation
library(caret)        # One-Hot Encoding
library(forecast)     # weather prediction
library(tseries)      # weather prediction
library(lubridate)    # date-time processing
library(ggimage)      # images inside ggplot
library(png)          # loading PNG files
library(arules)       # Association Rules
library(arulesViz)    # Association Rule visualization
library(reshape)      # visualization tool
library(plotly)       # interactive visualization
library(gridExtra)    # visualization tool
library(ROSE)         # ROSE sampling

# Load the flight data: AFSNT is kept as the pristine copy, afsnt is the
# working copy; AFSNT_DLY is the set whose delays must be predicted.
# (fix: these two fread() calls had been fused onto the library() lines
# above during extraction, leaving them inside comments.)
AFSNT <- fread("./data/AFSNT.csv",stringsAsFactors=F,header=T); afsnt <- AFSNT
AFSNT_DLY <- fread("./data/AFSNT_DLY.csv",stringsAsFactors=F,header=T); test <- AFSNT_DLY
2 EDA
2.1 비행 지연 사유
# Shared color palette for every chart in this document.
my_palette <- c("#33A9AC","#6A97F2","#858AF2","#07028C","#1C03BF","#2404F2",
                "#3F2BBF","#291572","#69607E","#352459","#343779","#ADB0D9",
                "#8B5A8C","#982062","#623FB1","#8369C1","#8858F6")

# Bar chart: number of delayed flights per delay-reason code (DRR), top 17.
plt_drr <- ggplot(data = afsnt %>%
                    filter(DLY == "Y") %>%
                    group_by(DRR) %>%
                    summarise(N = n()) %>%
                    arrange(-N) %>%
                    head(17),
                  aes(x = reorder(DRR, -N), N)) +
  geom_col(aes(fill = DRR), alpha = 0.6) +
  scale_fill_manual(values = my_palette) +
  ggtitle("비행 지연 사유") +
  labs(x = "지연 사유", y = "지연 수") +
  theme(legend.title = element_blank(),
        plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = "none")
plt_drr
C02(A/C 접속)사유로 가장 많이 지연되는 것을 볼 수 있습니다.
즉, A/C 접속 지연 예측이 가장 중요하다고 할 수 있습니다.
2.2 비행 지연 사유 C02 제외
# Delay counts with C02 (A/C connection) removed, to expose the
# next-largest delay reasons.
plt_drr_dC02 <- ggplot(data = afsnt %>%
                         filter(DLY == "Y" & DRR != "C02") %>%
                         group_by(DRR) %>%
                         summarise(N = n()) %>%
                         arrange(-N) %>%
                         head(17),
                       aes(x = reorder(DRR, -N), N)) +
  geom_col(aes(fill = DRR), alpha = 0.6) +
  scale_fill_manual(values = my_palette) +
  ggtitle("비행 지연 사유 - 'C02' 제외") +
  xlab("지연 사유") +
  ylab("지연 수") +
  theme(plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = "none")
plt_drr_dC02
C02를 제외하고 지연 수를 살펴본 결과,
C01(A/C 정비), A01(안개) 차례로 높은 요인인 것을 확인했습니다.
C01 예측을 위한 기체 변수, A01 예측을 위한 날씨 변수가 필요합니다.
2.3 날씨 요인의 지연 수
# Weather-related delay reasons only (DRR codes containing "A").
plt_weather <- ggplot(data = afsnt %>%
                        filter(DLY == "Y" & str_detect(DRR, "A")) %>%
                        group_by(DRR) %>%
                        summarise(N = n()) %>%
                        head(10),
                      aes(x = reorder(DRR, -N), y = N)) +
  geom_col(aes(fill = DRR), alpha = 0.6) +
  ggtitle("날씨 지연 사유") +
  scale_fill_manual(values = my_palette) +
  xlab("지연 사유") +
  ylab("지연 수") +
  theme(plot.title = element_text(size = 20, hjust = 0.5),
        legend.title = element_blank(),
        legend.position = "none")
plt_weather
날씨 지연 사유만 살펴본 결과 A01(안개), A05(강풍)이 가장 많은 것으로 나타났습니다.
A01 예측을 위한 안개 발생 조건 파악이 필요합니다.
2.4 공항별 지연율
# Departure delay rate per airport: delayed departures / total departures.
plt_arp <- ggplot(data = afsnt %>%
                    mutate(DLY = ifelse(DLY == "Y", 1, 0)) %>%
                    filter(AOD == "D") %>%
                    group_by(ARP) %>%
                    summarise(a = sum(DLY), b = n()) %>%
                    mutate(p = a / b),
                  aes(x = reorder(ARP, -p), y = p)) +
  geom_col(aes(fill = ARP), alpha = 0.619) +
  scale_fill_manual(values = my_palette) +
  ggtitle("공항별 지연율") +
  xlab("공항") +
  ylab("지연율") +
  theme(plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
plt_arp
공항마다 지연율이 상이함을 알 수 있습니다.
특히 ARP15(인천), ARP13(군산), ARP3(제주) 세 개 공항의 지연율이 높습니다.
공항마다 지연율이 다르게 나타나는 원인을 파악하기 위해
B(공항 요인)를 자세히 살펴 보았습니다.
2.4.1 공항 요인 지연 수
# Delay counts attributable to airport-side reasons (DRR codes containing
# "B"), per airport.
plt_craft <- ggplot(data = AFSNT %>%
                      mutate(DLY = ifelse(DLY == "Y", 1, 0),
                             DLY_B = ifelse(str_detect(DRR, "B"), 1, 0)) %>%
                      group_by(ARP) %>%
                      summarise(a = sum(DLY_B)),
                    # fix: the original mapped y = sort(a, decreasing = T),
                    # which reorders the counts independently of the x axis
                    # and pairs each airport with the wrong value. Reorder
                    # the x axis instead so each bar keeps its own count.
                    aes(x = reorder(ARP, -a), y = a)) +
  geom_col(aes(fill = ARP), alpha = 0.6) +
  scale_fill_manual(values = my_palette) +
  ggtitle("공항 사유로 인한 지연 수") +
  xlab("공항") +
  ylab("지연 수") +
  theme(plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
plt_craft
B(공항 요인)로 인한 지연 수를 살펴본 결과,
공항 별로 차이가 크다는 점을 확인했습니다.
즉, 공항 자체가 중요한 변수로 작용할 것입니다.
2.5 시간대별 지연율
## Bucket departures by scheduled hour and compute, per hour slot, the
## overall delay rate and the C02-specific delay rate.
arp_tuner_d <- function(df){
  departures <- df %>%
    filter(AOD == 'D') %>%
    select(STT, DLY, DRR) %>%
    mutate(DLY = ifelse(DLY == "Y", 1, 0),
           C02 = ifelse(DRR == "C02", 1, 0))
  # Hour component of the scheduled time string ("H:MM" / "HH:MM").
  departures$schedule <- strptime(departures$STT, "%H:%M")
  departures$timeslot <- departures$schedule$hour
  rates <- departures %>%
    select(timeslot, DLY, C02) %>%
    group_by(timeslot) %>%
    summarise(n = n(), DLY_sum = sum(DLY), C02_sum = sum(C02)) %>%
    mutate(prop_dly = DLY_sum / n, prop_C02 = C02_sum / n)
  return(rates)
}
image <- readPNG("./image/aircraft.png")
# Delay rate by hour of day, drawn with an aircraft icon at each point.
plt_time_prop <- ggplot(data = arp_tuner_d(afsnt) %>%
                          mutate(image = "./image/aircraft.png"),
                        aes(x = timeslot, y = prop_dly)) +
  geom_image(aes(image = image), size = 0.07) +
  geom_line(col = "#33A9AC", lwd = 0.3) +
  xlim(6, 22) +
  ylim(0, 0.4) +
  ggtitle("시간대별 지연율") +
  xlab("시간대 / 24h") +
  ylab("지연율") +
  theme(plot.title = element_text(size = 20, hjust = 0.5),
        legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))
plt_time_prop
# Overlay: overall delay rate vs the C02-only delay rate per hour (06-22h).
plt_time_C02 <- ggplot(data = arp_tuner_d(afsnt) %>%
                         filter(timeslot >= 6 & timeslot <= 22),
                       aes(x = timeslot, y = prop_dly, group = 1)) +
  geom_col(aes(x = as.factor(timeslot), y = prop_dly), alpha = 0.619) +
  geom_col(aes(x = as.factor(timeslot), y = prop_C02, fill = "red")) +
  ggtitle("시간대 별 C02의 지연율") +
  labs(x = "시간대", y = "C02 지연율", fill = "C02 지연율\n", colour = "C02") +
  scale_color_hue(labels = "C02") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))
plt_time_C02 + scale_fill_discrete(labels = "C02")
오전보다는 오후에 지연이 더 잦은 것을 확인할 수 있습니다.
시간대에 따라 지연 여부가 영향을 받을 것이라고 판단했습니다.
3 Feature Engineering
3.1 외부 데이터 불러오기
3.1.1 weather_arp.csv
항공 기상청에서 가져온 공항의 날씨 데이터입니다.
3.1.1.1 변수 설명
PS_AVG : 평균 해면 기압(hpa)
TMP_AVG, TMP_MAX, TMP_MNM : 평균/ 최대/ 최저 기온 (°C)
TD_AVG : 평균 이슬점(°C)
HM_AVG, HM_MNM : 평균/ 최저 상대 습도(%)
WSPD_AVG, WSPD_MAX, WSPD_INS : 평균/ 최대/ 최대순간 풍속 (knot)
CA_TOT_AVG, CLA : 평균/ 최저운고 운량(1/8)
RN_SUM : 강수량(mm)
VIS_MNM : 최단 시정(10m)
### Airport weather loading
# The monthly airport-weather CSVs all share one layout, so a single pair of
# helpers replaces the seven copy-pasted loading sections (one per airport:
# ARP15 Incheon, ARP1 Gimpo, ARP3 Jeju, ARP7 Muan, ARP5 Ulsan, ARP9 Yeosu,
# ARP10 Yangyang).

# Read one monthly weather CSV, tag it with year/month/airport, impute NAs.
# Imputation rules (as in the original code): no rain/snow recorded -> 0;
# missing minimum visibility -> 1000 (best value, unit 10 m); missing cloud
# amount (CLA) and cloud base (BASE) -> 0.
read_month_weather <- function(dir, yy, mm, arp, icao) {
  # 2017/2018 files are named MONTH_20171 ... MONTH_201712 (no zero padding)
  # while 2019 files are zero padded: MONTH_201901 ... MONTH_201906.
  mm_str <- if (yy == 2019) sprintf("%02d", mm) else as.character(mm)
  dat <- read.csv(paste0(dir, "MONTH_", yy, mm_str, "_", icao, ".csv"),
                  header = T, fileEncoding = "UTF8", stringsAsFactors = F)
  dat <- cbind(yy, mm, arp, dat)
  colnames(dat)[c(1, 2, 3)] <- c("SDT_YY", "SDT_MM", "ARP")
  dat$RN_SUM  <- ifelse(is.na(dat$RN_SUM), 0, dat$RN_SUM)
  dat$SD_MAX  <- ifelse(is.na(dat$SD_MAX), 0, dat$SD_MAX)
  dat$VIS_MNM <- ifelse(is.na(dat$VIS_MNM), 1000, dat$VIS_MNM)
  dat$CLA     <- ifelse(is.na(dat$CLA), 0, dat$CLA)
  dat$BASE    <- ifelse(is.na(dat$BASE), 0, dat$BASE)
  return(dat)
}

# Load and stack every available month (2017-01 .. 2019-06) for one airport.
load_weather_arp <- function(arp, icao) {
  dir <- paste0("./data/airweather/", arp, "/")
  yymm <- rbind(data.frame(yy = 2017, mm = 1:12),
                data.frame(yy = 2018, mm = 1:12),
                data.frame(yy = 2019, mm = 1:6))
  do.call(rbind,
          Map(function(yy, mm) read_month_weather(dir, yy, mm, arp, icao),
              yymm$yy, yymm$mm))
}

weather_arp15 <- load_weather_arp("ARP15", "RKSI") # Incheon
weather_arp1  <- load_weather_arp("ARP1",  "RKSS") # Gimpo
weather_arp3  <- load_weather_arp("ARP3",  "RKPC") # Jeju
weather_arp7  <- load_weather_arp("ARP7",  "RKJB") # Muan
# fix: the original ARP5/2019 loop imputed missing VIS_MNM with 0 instead of
# 1000, unlike every other airport and year; unified to 1000 here.
weather_arp5  <- load_weather_arp("ARP5",  "RKPU") # Ulsan
weather_arp9  <- load_weather_arp("ARP9",  "RKJY") # Yeosu
weather_arp10 <- load_weather_arp("ARP10", "RKNY") # Yangyang
## export the per-airport weather data (kept for reproducibility)
# write.csv(weather_arp1,"weather_arp1.csv",row.names = F)
# write.csv(weather_arp10,"weather_arp10.csv",row.names = F)
# write.csv(weather_arp15,"weather_arp15.csv",row.names = F)
# write.csv(weather_arp3,"weather_arp3.csv",row.names = F)
# write.csv(weather_arp5,"weather_arp5.csv",row.names = F)
# write.csv(weather_arp7,"weather_arp7.csv",row.names = F)
# write.csv(weather_arp9,"weather_arp9.csv",row.names = F)
# Combine all airports, derive two features, drop unused columns.
# MIST = minimum temperature minus average dew point (presumably a fog
# proxy: small spread -> fog risk - TODO confirm); HM_INV = 1 / avg humidity.
weather_arp <- rbind(weather_arp1, weather_arp3, weather_arp5, weather_arp7,
                     weather_arp9, weather_arp10, weather_arp15) %>%
  mutate(MIST = TMP_MNM - TD_AVG, HM_INV = 1 / HM_AVG) %>%
  select(-c(CLF, SD_MAX, WD_MAX, WD_INS, TMP_AVG, TMP_MAX, TMP_MNM,
            TD_AVG, HM_AVG, HM_MNM))
# write.csv(weather_arp,"weather_arp.csv",row.names=F)
3.1.2 airport data.csv
인천공항은 서울지방항공청, 그 외 공항은 한국공항공사 공공데이터에서 가져온 데이터입니다
3.1.2.1 변수설명
APRON_SIZE : 계류장 면적
CRAFT_STAND : 최대 동시 주기 수
# Apron (parking) capacity by airport and year.
parking_17 <- read.csv("./data/airport/parking_17.csv",stringsAsFactors=F)
parking_18 <- read.csv("./data/airport/parking_18.csv",stringsAsFactors=F)
parking_19 <- read.csv("./data/airport/parking_19.csv",stringsAsFactors=F)
# Incheon is missing from these files; appended manually (apron size, stands).
icn <- c('ARP15',2437000,171)

# Map Korean airport names to ARP codes and tidy the capacity columns.
# Returns a data frame with SDT_YY, ARP, size, craft.
parking_tuner <- function(data, year){
  data <- data %>% select(c(1,2,3))
  colnames(data) <- c('ARP','size','craft')
  data$ARP <- str_trim(data$ARP)
  # Lookup table replaces the original chain of 14 ifelse() calls;
  # unknown names are left unchanged, exactly as before.
  arp_codes <- c('김포'='ARP1','김해'='ARP2','제주'='ARP3','대구'='ARP4',
                 '울산'='ARP5','청주'='ARP6','무안'='ARP7','광주'='ARP8',
                 '여수'='ARP9','양양'='ARP10','포항'='ARP11','사천'='ARP12',
                 '군산'='ARP13','원주'='ARP14')
  known <- data$ARP %in% names(arp_codes)
  data$ARP[known] <- arp_codes[data$ARP[known]]
  data$size <- as.integer(gsub(',','',data$size))
  # craft column looks like "152(...)": keep the leading count only.
  data$craft <- sapply(data$craft, function(x) as.integer(str_split(x, pattern='\\(')[[1]][1]))
  data <- rbind(data, icn)
  data$SDT_YY <- year
  data <- data %>% select(SDT_YY, ARP, size, craft)
  return(data)
}

airport <- rbind(parking_tuner(parking_17, 2017),
                 parking_tuner(parking_18, 2018),
                 parking_tuner(parking_19, 2019))
# rbind() with the character vector `icn` coerced every column to character;
# restore integer types here.
airport$size <- as.integer(airport$size)
airport$craft <- as.integer(airport$craft)
colnames(airport) <- c('SDT_YY','ARP','APRON_SIZE','CRAFT_STAND')
head(airport,5)
  SDT_YY ARP APRON_SIZE CRAFT_STAND
1 2017 ARP1 1215487 152
2 2017 ARP2 389358 41
3 2017 ARP3 403994 36
4 2017 ARP4 43982 7
5 2017 ARP5 33480 6
# write.csv(airport,'airport data.csv',row.names=F)
3.2 항공기 일정 파악하기
# Tidy the time variables of the training data.
afsnt <- afsnt %>% mutate(DATE=as_date(paste(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
SCT=as.integer(gsub(':','',STT)), # numeric HHMM, used for ordering flights
STT2=hm(STT), # scheduled time as hours-minutes
ATT2=hm(ATT), # actual time as hours-minutes
DIF=ATT2-STT2) # delay duration
# Express the delay in minutes.
# NOTE(review): a flight whose actual time rolls past midnight would get a
# negative DIF here - confirm the data contains no such rows.
afsnt$DIF <- afsnt$DIF@hour*60 + afsnt$DIF@minute
afsnt <- afsnt %>% select(-c(STT2, ATT2))
### Tidy the date/time columns of the prediction set as well.
AFSNT_DLY <- AFSNT_DLY %>% mutate(DATE=as_date(paste(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
SCT=as.integer(gsub(':','',STT)) )
3.2.1 Association Analysis - 연관 분석
각각의 REG가 운항하는 FLT가 서로 어떤 연관성을 가지는 지 파악하기 위해 연관분석을 실행합니다.
# Keep only flights that also appear in the prediction set (AFSNT_DLY).
afsnt_common <- afsnt %>% filter(FLT %in% AFSNT_DLY$FLT) %>% arrange(DATE,FLT,SCT)

# Departures since 2019-03-31, overall and per airline (FLO); these serve as
# "transactions" for the association analysis. The helper replaces seven
# identical copy-pasted pipelines.
dep_reg_flt <- function(flo) {
  afsnt_common %>%
    filter(FLO == flo, AOD == 'D', DATE >= '2019-03-31') %>%
    select(REG, FLT, DATE)
}
REG_FLT   <- afsnt_common %>% filter(AOD =='D',DATE>='2019-03-31') %>% select(REG,FLT,DATE)
A_REG_FLT <- dep_reg_flt('A')
B_REG_FLT <- dep_reg_flt('B')
F_REG_FLT <- dep_reg_flt('F')
H_REG_FLT <- dep_reg_flt('H')
I_REG_FLT <- dep_reg_flt('I')
J_REG_FLT <- dep_reg_flt('J')
L_REG_FLT <- dep_reg_flt('L')

# The 10 most frequently flown tail numbers (REG) of one airline.
top10_reg <- function(reg_flt) {
  ranked <- reg_flt %>% group_by(REG) %>% summarise(n=n()) %>%
    arrange(desc(n)) %>% as.data.frame() %>% select(REG)
  ranked[1:10,]  # single-column data.frame -> character vector of length 10
}
A_TOPREG  <- top10_reg(A_REG_FLT)
# NOTE(review): the original overwrote the *_REG_FLT frames with their own
# top-10 vectors (only airline A kept a separate A_TOPREG name); that
# behavior is preserved here because later code may rely on these names.
B_REG_FLT <- top10_reg(B_REG_FLT)
F_REG_FLT <- top10_reg(F_REG_FLT)
H_REG_FLT <- top10_reg(H_REG_FLT)
I_REG_FLT <- top10_reg(I_REG_FLT)
J_REG_FLT <- top10_reg(J_REG_FLT)
L_REG_FLT <- top10_reg(L_REG_FLT)
3.2.2 항공사별 REG로 연관 분석시행
# Association-rule mining input per aircraft (REG):
# one "shopper" = one day, one "item" = a flight number (FLT).
# Airline A
A_tran <- REG_FLT %>% filter(REG %in% A_TOPREG) # keep only rows for the top-10 REGs
A_tran$DATE <- A_tran$DATE %>% as.character.Date()
A_unique_date <- unique(A_tran$DATE)# 92 unique dates in total
# Pre-create empty result lists.
y<-list(); A_top <-list()
# For each top-10 REG and each date, list the FLTs that aircraft flew and
# slide a window of three consecutive flights over them, so the resulting
# transactions preserve the sequential structure of the daily schedule.
for (l in 1:10) {
y<-list()
for (i in seq(length(A_unique_date))){
FLT_COL <- A_tran %>% filter(REG==A_TOPREG[l], DATE== A_unique_date[i]) %>% select(FLT)
FLT_COL <- unlist(FLT_COL) %>% as.vector()
# Windows only exist when the aircraft flew at least three legs that day.
if(length(FLT_COL) != 0){
if(length(FLT_COL) != 1){
if(length(FLT_COL) != 2){
len = length(FLT_COL)-2
for (j in seq(len)){
k = j+2
x <- c(FLT_COL[j:k])
y <- append(y,list(x))
}
}
}
}
}
A_top[[l]] <- y
}
# Transaction data 로 변환
# apiori 함수 사용
# https://m.blog.naver.com/leedk1110/220788082381
####변수에 대한 설명
# https://m.blog.naver.com/PostView.nhn?blogId=gkenq&logNo=10188110816&proxyReferer=https%3A%2F%2Fwww.google.com%2F
# Mine association rules per top-10 REG: each transaction is a 3-flight
# window, so rules describe which flights an aircraft tends to fly in sequence.
# BUG FIX: the original passed control = list(verbos = F); apriori() does not
# recognize the misspelled parameter, so verbose output was never suppressed.
A_rule <- list()
for (i in seq(10)) {
  A_rule[[i]] <- apriori(A_top[[i]], control = list(verbose = FALSE),
                         parameter = list(support = 0.01, minlen = 3))
}
# inspect(sort(A_rule[[7]],by='lift'))# 얻어낸 rule 들을 data frame화
# A 항공사 최다 빈도 REG 로 확인 해보기
A_rule_data <- list()
for (i in 1){
A_rule_data[[i]] <- inspect(sort(A_rule[[i]],by='lift')) %>% as.data.frame()
} lhs rhs support confidence lift count
[1] {A1147,A1182} => {A1181} 0.01577287 1.0000000 21.133333 10
[2] {A1181,A1196} => {A1182} 0.01577287 1.0000000 21.133333 10
[3] {A1921,A1995} => {A1982} 0.02365931 1.0000000 20.451613 15
[4] {A1144,A1146} => {A1145} 0.04100946 1.0000000 12.192308 26
[5] {A1982,A1995} => {A1921} 0.02365931 1.0000000 11.122807 15
[6] {A1900,A1982} => {A1921} 0.02523659 1.0000000 11.122807 16
[7] {A1947,A1996} => {A1971} 0.04731861 1.0000000 10.566667 30
[8] {A1148,A1902} => {A1149} 0.04416404 1.0000000 10.566667 28
[9] {A1927,A1993} => {A1980} 0.03312303 1.0000000 10.225806 21
[10] {A1927,A1995} => {A1980} 0.01419558 1.0000000 10.225806 9
[11] {A1124,A1234} => {A1125} 0.05047319 1.0000000 9.906250 32
[12] {A1197,A1921} => {A1900} 0.03943218 0.9615385 8.835006 25
[13] {A1196,A1900} => {A1197} 0.03943218 1.0000000 8.233766 25
[14] {A1144,A1145} => {A1146} 0.04100946 1.0000000 8.128205 26
[15] {A1146,A1181} => {A1147} 0.01577287 1.0000000 8.128205 10
[16] {A1182,A1197} => {A1196} 0.01577287 1.0000000 8.128205 10
[17] {A1145,A1147} => {A1146} 0.04100946 1.0000000 8.128205 26
[18] {A1147,A1197} => {A1196} 0.02523659 1.0000000 8.128205 16
[19] {A1146,A1196} => {A1147} 0.02523659 1.0000000 8.128205 16
[20] {A1148,A1149} => {A1902} 0.04416404 0.9333333 7.044444 28
[21] {A1901,A1927} => {A1904} 0.04574132 1.0000000 7.044444 29
[22] {A1971,A1996} => {A1947} 0.04731861 1.0000000 6.967033 30
[23] {A1944,A1971} => {A1947} 0.04731861 1.0000000 6.967033 30
[24] {A1902,A1925} => {A1920} 0.04416404 1.0000000 6.967033 28
[25] {A1925,A1947} => {A1944} 0.04889590 1.0000000 6.891304 31
[26] {A1920,A1944} => {A1925} 0.04889590 1.0000000 6.891304 31
[27] {A1235,A1904} => {A1901} 0.04574132 0.9354839 6.817204 29
[28] {A1980,A1993} => {A1927} 0.03312303 1.0000000 6.744681 21
[29] {A1980,A1995} => {A1927} 0.01419558 1.0000000 6.744681 9
[30] {A1904,A1980} => {A1927} 0.04731861 1.0000000 6.744681 30
[31] {A1124,A1125} => {A1234} 0.05047319 1.0000000 6.604167 32
[32] {A1149,A1920} => {A1902} 0.04416404 0.8750000 6.604167 28
[33] {A1125,A1235} => {A1234} 0.05047319 1.0000000 6.604167 32
[34] {A1234,A1901} => {A1235} 0.04574132 1.0000000 6.604167 29
3.2.2.1 Association Rule 설명
품목 A 와 B 의 연관성을 나타냅니다.( A 가 선택되었을 때 B 가 선택되는 비율)
- lhs : left hand side (품목 A)
- rhs : right hand side (품목 B)
- Support : 지지도 (품목 A와 B를 포함하는 거래수 / 전체 거래 수)
- Confidence : 신뢰도 (품목 A를 포함하는 거래 중에서 품목 B도 포함하는 거래의 비율)
- Lift : 향상도 (연관 계측 능력에 대한 수치)
3.2.3 연관분석 결과에 대한 해석
lhs에 해당하는 FLT 가 있으면 해당 FLT를 운항한 기체는
rhs에 해당하는 FLT를 운항할 것으로 해석 가능합니다.
AFSNT를 통해 기체별 일정의 반복성을 파악한 후,
AFSNT의 REG를 AFSNT_DLY에 적용했습니다.
이것이 적절한 추론인지 재확인하기 위해 연관 분석을 사용했습니다.
3.2.4 A 항공편 연관분석 예시
REG : “SEw3NTk0” 의 Association Rule
# 1st REG
A_rule_data[[1]][1:5,] lhs rhs support confidence lift count
[1] {A1147,A1182} => {A1181} 0.01577287 1 21.13333 10
[2] {A1181,A1196} => {A1182} 0.01577287 1 21.13333 10
[3] {A1921,A1995} => {A1982} 0.02365931 1 20.45161 15
[4] {A1144,A1146} => {A1145} 0.04100946 1 12.19231 26
[5] {A1982,A1995} => {A1921} 0.02365931 1 11.12281 15
루틴을 적용한 AFSNT_DLY 의 FLT 들이 연관성이 있는지를 확인해보면,
모든 FLT 들이 서로 높은 연관성있는 FLT 그룹임을 확인할 수 있습니다.
3.2.4.1 예시
‘A’ 항공사의 REG : “SEw3NTk0”
해당 REG 에 대해서 AFSNT_DLY 에 입힌 REG 의 FLT 들이 합당한지에 대해서 Association Rule 에서 얻은 연관 FLT 와 비교하며 확인합니다.
reg_afsnt_dly <- fread('./data/afsnt_dly_predict.csv',stringsAsFactors=F,header=T)
uni_flt <- reg_afsnt_dly %>% filter(REG=="SEw3NTk0") %>% select(FLT) %>% unique()
flt_1<-c()for(i in 1){
A_rule_data[i][[1]]$lhs<-as.character(A_rule_data[i][[1]]$lhs)
A_rule_data[i][[1]]$rhs<-as.character(A_rule_data[i][[1]]$rhs)
A_rule_data[i][[1]]$lhs<-gsub("\\{","",A_rule_data[i][[1]]$lhs)
A_rule_data[i][[1]]$lhs<-gsub("\\}","",A_rule_data[i][[1]]$lhs)
A_rule_data[i][[1]]$rhs<-gsub("\\{","",A_rule_data[i][[1]]$rhs)
A_rule_data[i][[1]]$rhs<-gsub("\\}","",A_rule_data[i][[1]]$rhs)
A_rule_data[i][[1]]$group<-strsplit(A_rule_data[i][[1]]$lhs,",")
for(j in 1:nrow(A_rule_data[i][[1]])){
A_rule_data[i][[1]]$group[[j]]<-append(A_rule_data[i][[1]]$group[[j]],A_rule_data[i][[1]]$rhs[j])
}
}
flt_1<-c()
for(j in 1:nrow(A_rule_data[1][[1]])){
flt_1<-append(flt_1, A_rule_data[1][[1]]$group[[j]])
}
correct <-c()
for ( i in 1:nrow(uni_flt) ){
correct[i] <- ifelse(uni_flt$FLT[i] %in% flt_1 , 1, 0)
}
# 모든 값이 존재함을 확인 할 수 있다 3.3 MIST & HM_INV
안개가 생긴다면 시정이 짧아진다는 사실에 초점을 두고,
이를 표현할 수 있는 새로운 변수를 만들었습니다.
3.3.1 MIST
안개의 생성 조건 중 하나는 기온이 이슬점보다 낮아지는 것입니다.
\[ VIS\_MNM \propto (TMP\_MNM-TD\_AVG) \rightarrow MIST \]
# MIST & VIS_MNM plot
weather_arp<-fread("./data/weather_arp.csv", stringsAsFactors=F)
plt_mist<-ggplot(data=weather_arp[weather_arp$SDT_MM==9&VIS_MNM!=1000,],aes(x=MIST,y=VIS_MNM))+
geom_point(color="blue")+
geom_smooth(method="lm",col="red",se=F)+
ggtitle("Feature Engineering : MIST")+
theme(plot.title = element_text(size = 20,hjust=0.5))
plt_mist3.3.2 HM_INV
안개의 생성 조건 중 다른 하나는 습도가 높은 것입니다.
\[ VIS\_MNM \propto \frac{1}{HM\_AVG} \rightarrow HM\_INV \]
plt_HM_INV<-ggplot(data=weather_arp[weather_arp$SDT_MM==9&VIS_MNM!=1000,],aes(x=HM_INV,y=VIS_MNM))+
geom_point(color="blue")+
geom_smooth(method="lm",col="red",se=F)+
ggtitle("Feature Engineering : HM_INV")+
theme(plot.title = element_text(size = 20,hjust=0.5))
plt_HM_INV3.4 스케줄 혼잡도
3.4.1 COMPLEXITY
특정 항공기의 STT(예정시간)를 기준으로 1시간 이전부터 예정되어 있는 운항 수를 나타낸 변수입니다.
단위 시간 내에 운항 수가 증가하면 공항이 혼잡해진다는 가정 하에,
이는 비행 지연에 영향을 끼칠 것으로 판단했습니다.
AFSNT <- fread("./data/AFSNT.csv",stringsAsFactors=F,header=T)
afsnt<-AFSNT
# Schedule-complexity counter for one airport.
#
# For airport ARP{i}, stacks departures (AOD == 'D' at ARP{i}) and arrivals
# (AOD == 'A' whose ODP is ARP{i}), orders them by scheduled timestamp, and
# for every flight counts how many same-day flights are scheduled within the
# 60 minutes up to and including its own STT.  Returns the input columns plus
# a `count` column.
#
# NOTE(review): the loop variable `i` shadows the airport-number argument `i`;
# harmless here because the argument is not used after the loop starts, but
# fragile.  A near-duplicate arp_tuner later in the file filters arrivals by
# ARP instead of ODP -- confirm which is intended.
arp_tuner <- function(df,i){
afsnt_d <- df %>% filter(AOD=='D' & ARP==paste0('ARP',i))
afsnt_a <- df %>% filter(AOD=='A' & ODP==paste0('ARP',i))
afsnt_arp <- rbind(afsnt_d, afsnt_a)
# Build a sortable POSIX timestamp out of the Y/M/D and HH:MM columns.
afsnt_arp <- afsnt_arp %>% select(SDT_YY,SDT_MM,SDT_DD,STT,ARP,ODP,AOD,FLT) %>%
mutate(STTT=paste(SDT_YY,SDT_MM,SDT_DD,STT,sep=':'),
POSIX=as.POSIXct(STTT,format='%Y:%m:%d:%H:%M')) %>% arrange(POSIX)
# POSIXlt gives direct access to the hour/min/mday fields used below.
afsnt_arp$schedule <- strptime(afsnt_arp$STTT,'%Y:%m:%d:%H:%M')
schedule_time <- afsnt_arp$schedule$hour*60 + afsnt_arp$schedule$min
# count[i] = number of flights from the start of the day (index tmp) whose
# scheduled time lies within 60 minutes of flight i's scheduled time.
count <- c(1); tmp <- 1; cnt <- 0
for (i in 2:nrow(afsnt_arp)){
if(afsnt_arp$schedule[i]$mday!=afsnt_arp$schedule[i-1]$mday){
# Day boundary: restart the counting window at this row.
tmp <- i; count[i] <- 1
} else{
cnt <- 0
for (j in tmp:i){
if(schedule_time[i] - schedule_time[j] <= 60){
cnt <- cnt+1
}
}
count[i] <- cnt
}
}
afsnt_arp <- cbind(afsnt_arp, count) %>% select(-c('POSIX','STTT','schedule'))
return(afsnt_arp)
}
complexity <- arp_tuner(afsnt,1)
for (i in 2:15){
complexity <- rbind(complexity, arp_tuner(afsnt,i))
}3.5 시간대별 C02 (A/C 접속) 지연율
3.5.1 C02
위에서 A/C 접속 예측의 중요성을 언급했기에,
시간대별, 공항별 A/C 접속 지연율을 변수로 추가했습니다.
AFSNT <- fread("./data/AFSNT.csv",stringsAsFactors = F,header=T)
afsnt<-AFSNT
## AOD가 D일 때
# Hourly C02 (aircraft-connection) delay rates for departures at one airport.
#
# @param df AFSNT-style flight data (needs AOD, ARP, STT, DLY, DRR columns).
# @param i  Airport number; only departures (AOD == 'D') at ARP{i} are used.
# @return One row per departure hour (`timeslot`) with the flight count `n`,
#   delay count `DLY_sum`, C02 count `DRR_C02`, the derived rates `prop_C02`
#   and `prop_DLY`, and the airport id in `ARP`.
arp_tuner_d <- function(df, i) {
  arp_id <- paste0('ARP', i)
  dep <- df %>%
    filter(AOD == 'D' & ARP == arp_id) %>%
    select(ARP, STT, DLY, DRR) %>%
    mutate(C02 = ifelse(DRR == "C02", 1, 0),
           DLY_ = ifelse(DLY == "Y", 1, 0))
  # Bucket every flight by its scheduled departure hour.
  dep$schedule <- strptime(dep$STT, "%H:%M")
  dep$timeslot <- dep$schedule$hour
  dep %>%
    select(timeslot, ARP, DLY_, C02) %>%
    group_by(timeslot) %>%
    summarise(n = n(), DLY_sum = sum(DLY_), DRR_C02 = sum(C02)) %>%
    mutate(prop_C02 = DRR_C02 / n, prop_DLY = DLY_sum / n,
           ARP = arp_id)
}
D_Prop_C02 <- arp_tuner_d(afsnt,1)
for (i in 2:15){
D_Prop_C02 <- rbind(D_Prop_C02, arp_tuner_d(afsnt,i))
}
# write.csv(D_Prop_C02, 'Proportion_C02_D.csv', row.names=F)
## AOD=A일 때
# Hourly C02 (aircraft-connection) delay rates for arrivals at one airport.
#
# @param df AFSNT-style flight data (needs AOD, ARP, STT, DLY, DRR columns).
# @param i  Airport number; only arrivals (AOD == 'A') at ARP{i} are used.
# @return One row per arrival hour (`timeslot`) with the flight count `n`,
#   delay count `DLY_sum`, C02 count `DRR_C02`, the rate `prop_C02`, and the
#   airport id in `ARP`.  (Unlike arp_tuner_d, no prop_DLY column is made.)
arp_tuner_a <- function(df, i) {
  arp_id <- paste0('ARP', i)
  arr <- df %>%
    filter(AOD == 'A' & ARP == arp_id) %>%
    select(ARP, STT, DLY, DRR) %>%
    mutate(C02 = ifelse(DRR == "C02", 1, 0),
           DLY_ = ifelse(DLY == "Y", 1, 0))
  # Bucket every flight by its scheduled arrival hour.
  arr$schedule <- strptime(arr$STT, "%H:%M")
  arr$timeslot <- arr$schedule$hour
  arr %>%
    select(timeslot, ARP, DLY_, C02) %>%
    group_by(timeslot) %>%
    summarise(n = n(), DLY_sum = sum(DLY_), DRR_C02 = sum(C02)) %>%
    mutate(prop_C02 = DRR_C02 / n, ARP = arp_id)
}
A_Prop_C02 <- arp_tuner_a(afsnt,1)
for (i in 2:15){
A_Prop_C02 <- rbind(A_Prop_C02, arp_tuner_a(afsnt,i))
}
# write.csv(A_Prop_C02, 'Proportion_C02_A.csv', row.names=F)3.6 REG별 지연 관련 변수
3.6.1 REG RATE
REG별 C02를 제외한 항공 지연율 밀도 분포 그래프를 그린 후,
범위에 따라 범주화를 통해 변수를 만들었습니다.
third <- AFSNT %>% group_by(REG) %>% mutate(ALL=n()) %>%
filter(str_detect(DRR,'C')==T, DRR!='C02') %>%
group_by(REG) %>% mutate(N=n(),RATE=N/ALL)
b <- third %>% select(REG, RATE) %>% unique()
p <- qplot(RATE, data=b, geom='density')+
ggtitle("기체별 불량률 분포")+
labs(x="불량률")+
theme(legend.title = element_blank(),plot.title = element_text(size = 20,hjust=0.5),
legend.position = "none")
ggplotly(p)왼쪽으로 치우친 분포를 따르는 것으로 보아
기체 당 정비 문제가 평균적으로 균일하게 일어나지만, 예외적으로 불량률이 높은 기체가 존재합니다.
불량률의 분포에 따라 REG를 범주화하여 REG_RATE 변수를 생성했습니다.
3.6.2 PAST
같은 REG를 가진 기체에 대해서 앞선 운항이 지연되면 다음 운항들이 연쇄적으로 지연이 되는 것을 확인했습니다.
따라서, 앞선 운항의 지연 여부를 PAST 변수로 생성했습니다.
# PAST: delay status of the same aircraft's immediately preceding flight.
#
# Flights are ranked within each (DATE, REG) group by scheduled time; a flight
# gets PAST = 1 when the previous flight of the same aircraft on the same day
# was delayed, 0 when it was on time or when there is no previous flight
# (first of the day, or blank REG), and NA when the rank sequence is broken
# (e.g. tied ranks from rank() defeat the ORDER - 1 == previous check).
#
# @param df AFSNT-style data frame with SDT_YY/SDT_MM/SDT_DD, STT, REG, DLY.
# @return `df` arranged by DATE, REG, TIME with a PAST column appended and the
#   helper columns DATE/TIME/ORDER removed.
past_tuner <- function(df){
df <- df %>% mutate(DATE=as.Date(paste0(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
TIME=as.numeric(gsub(':','',STT))) %>%
group_by(DATE, REG) %>% mutate(ORDER=rank(TIME)) %>% arrange(DATE, REG, TIME)
ord <- df$ORDER; dly <- df$DLY; reg <- df$REG
n <- nrow(df)
# Preallocate the result: the original grew a vector with append() inside the
# loop, which is O(n^2) on hundreds of thousands of rows.
past <- numeric(n)
for (i in seq_len(n)){
if (ord[i] == 1 || reg[i] == ''){
past[i] <- 0
}
else if (ord[i] - 1 == ord[i-1]){
past[i] <- if (dly[i-1] == 'Y') 1 else 0
}
else {
past[i] <- NA
}
}
output <- cbind(df, PAST=past) %>% as.data.frame() %>% select(-c(DATE, TIME, ORDER))
return(output)
}
4.1 훈련 데이터
위에서 생성한 여러 변수들을 기존의 AFSNT.csv에 붙여서 모델링을 위한 준비 작업을 했습니다.
4.1.0.1 변수 설명
- COMPLEXITY : 공항 혼잡도
- APRON_SIZE : 계류장 면적
- CRAFT_STAND : 최대 동시 주기 수
- C02 : 시간별 A/C 접속 지연 비율
- WEATHER : 날씨 관련 변수들
- ARP / ODP : 기준 공항 / 상대 공항
- FLO : 항공사
- SDT_DY : 요일
- REG_RATE : REG별 C02를 제외한 지연(정비) 비율을 범주화한 변수
- PAST : 이전 스케줄의 지연 여부
plt_arp<-ggplot(data=AFSNT %>%
mutate(DLY=ifelse(DLY=="Y",1,0),
DLY_A=ifelse(str_detect(DRR,"A"),1,0)) %>%
group_by(SDT_MM) %>%
summarise(a=sum(DLY_A),b=n()) %>%
mutate(p=a/b),aes(x=as.factor(SDT_MM),y=p))+
geom_col(aes(fill=SDT_MM),alpha=0.6)+
ggtitle("월별 날씨 지연율")+
xlab("월")+
ylab("지연율")+
theme(plot.title = element_text(size = 20,hjust=0.5),legend.title = element_blank(),
legend.position = "none")
weather_arp<-fread("./data/weather_arp.csv",header = T,stringsAsFactors = F)
plt_weather<-ggplot(data=weather_arp %>%
group_by(SDT_MM) %>%
summarise(a=mean(RN_SUM)),aes(x=as.factor(SDT_MM),y=a))+
geom_col(aes(fill=SDT_MM),alpha=0.6)+
ggtitle("월별 평균 강우량")+
xlab("월")+
ylab("평균 강우량")+
theme(legend.title = element_blank(),plot.title = element_text(size = 20,hjust=0.5),
legend.position = "none")
gridExtra::grid.arrange(plt_arp,plt_weather,nrow=1,ncol=2)최종적으로 예측해야 할 시기는 9월 16일부터 9월 30일까지이므로 정확도를 위해
강우량이 비슷한 9월과 10월의 데이터만 이용하여 train_model.csv 파일을 만들었습니다.
AFSNT <- fread("./data/AFSNT.csv",stringsAsFactors = F,header=T);afsnt<-AFSNT;
weather_arp<-fread("./data/weather_arp.csv",header = T)
prop_C02_A<-fread("./data/Proportion_C02_A.csv",header = T,stringsAsFactors = F)
prop_C02_D<-fread("./data/Proportion_C02_D.csv",header = T,stringsAsFactors = F)
airport <- fread("./data/airport_data.csv",header = T,stringsAsFactors = F)
## Schedule Complexity
## Schedule Complexity
# Second definition of arp_tuner (shadows the earlier one): same 60-minute
# same-day flight counter, but the result column is named COMPLEXITY.
#
# NOTE(review): this version filters arrivals by ARP == ARP{i}, while the
# earlier definition used ODP == ARP{i} -- confirm which airport reference is
# intended for arrivals.  The loop variable `i` also shadows the airport
# argument `i`; harmless because the argument is not used after the loop.
arp_tuner <- function(df,i){
afsnt_d <- df %>% filter(AOD=='D' & ARP==paste0('ARP',i))
afsnt_a <- df %>% filter(AOD=='A' & ARP==paste0('ARP',i))
afsnt_arp <- rbind(afsnt_d, afsnt_a)
# Build a sortable POSIX timestamp out of the Y/M/D and HH:MM columns.
afsnt_arp <- afsnt_arp %>% select(SDT_YY,SDT_MM,SDT_DD,STT,ARP,ODP,AOD,FLT) %>%
mutate(STTT=paste(SDT_YY,SDT_MM,SDT_DD,STT,sep=':'),
POSIX=as.POSIXct(STTT,format='%Y:%m:%d:%H:%M')) %>% arrange(POSIX)
# POSIXlt gives direct access to the hour/min/mday fields used below.
afsnt_arp$schedule <- strptime(afsnt_arp$STTT,'%Y:%m:%d:%H:%M')
schedule_time <- afsnt_arp$schedule$hour*60 + afsnt_arp$schedule$min
# COMPLEXITY[i] = number of flights since the start of the day (index tmp)
# scheduled within 60 minutes of flight i's scheduled time.
COMPLEXITY <- c(1); tmp <- 1; cnt <- 0
for (i in 2:nrow(afsnt_arp)){
if(afsnt_arp$schedule[i]$mday!=afsnt_arp$schedule[i-1]$mday){
# Day boundary: restart the counting window at this row.
tmp <- i; COMPLEXITY[i] <- 1
} else{
cnt <- 0
for (j in tmp:i){
if(schedule_time[i] - schedule_time[j] <= 60){
cnt <- cnt+1
}
}
COMPLEXITY[i] <- cnt
}
}
afsnt_arp <- cbind(afsnt_arp, COMPLEXITY) %>% select(-c('POSIX','STTT','schedule'))
return(afsnt_arp)
}
COMPLEXITY <- arp_tuner(afsnt,1)
for (i in 2:15){
COMPLEXITY <- rbind(COMPLEXITY, arp_tuner(afsnt,i))
}
afsnt<-left_join(afsnt,COMPLEXITY,by=c("SDT_YY","SDT_MM","SDT_DD","STT","ARP","ODP","AOD","FLT"))
## airport data
afsnt<-left_join(afsnt,airport,by=c("ARP","SDT_YY"))
## devide AOD = A or D
afsnt_D<-afsnt %>%
filter(AOD=="D")
afsnt_A<-afsnt %>%
filter(AOD=="A")
## prop_B,C,D if AOD=A
afsnt_A$schedule<-strptime(afsnt_A$STT,"%H:%M")
afsnt_A$timeslot<-afsnt_A$schedule$hour
afsnt_A<-left_join(afsnt_A %>% select(-schedule),prop_C02_A,by=c("ARP","timeslot")) %>%
select(-c(timeslot,n,DLY_sum,DRR_C02)) %>%
dplyr::rename(C02=prop_C02)
## prop_B,C,D if AOD=D
afsnt_D$schedule<-strptime(afsnt_D$STT,"%H:%M")
afsnt_D$timeslot<-afsnt_D$schedule$hour
afsnt_D<-left_join(afsnt_D %>% select(-schedule),prop_C02_D,by=c("ARP","timeslot")) %>%
select(-c(timeslot,n,DLY_sum,DRR_C02,prop_DLY)) %>%
dplyr::rename(C02=prop_C02)
## weather data
afsnt_D$ARP_fake<-ifelse(afsnt_D$ARP %in% c("ARP2","ARP4","ARP11"),"ARP5",
ifelse(afsnt_D$ARP %in% c("ARP8","ARP12"),"ARP9",
ifelse(afsnt_D$ARP=="ARP6","ARP15",
ifelse(afsnt_D$ARP=="ARP13","ARP7",
ifelse(afsnt_D$ARP=="ARP14","ARP10",afsnt_D$ARP)))))
afsnt_A$ODP_fake<-ifelse(afsnt_A$ODP %in% c("ARP2","ARP4","ARP11"),"ARP5",
ifelse(afsnt_A$ODP %in% c("ARP8","ARP12"),"ARP9",
ifelse(afsnt_A$ODP=="ARP6","ARP15",
ifelse(afsnt_A$ODP=="ARP13","ARP7",
ifelse(afsnt_A$ODP=="ARP14","ARP10",afsnt_A$ODP)))))
afsnt_D<-left_join(afsnt_D,weather_arp,by=c("SDT_YY"="SDT_YY","SDT_MM"="SDT_MM",
"ARP_fake"="ARP","SDT_DD"="DATE")) %>%
select(-ARP_fake)
afsnt_A<-left_join(afsnt_A,weather_arp,by=c("SDT_YY"="SDT_YY","SDT_MM"="SDT_MM",
"ODP_fake"="ARP","SDT_DD"="DATE")) %>%
select(-ODP_fake)
afsnt<-rbind(afsnt_A,afsnt_D) %>%
filter(SDT_MM==9|SDT_MM==10) %>%
mutate(special_fit=ifelse(str_sub(FLT,-1) %in% c("T","M","F"),1,0)) %>%
filter(special_fit==0) %>%
select(-special_fit) %>%
filter(CNL=="N")
arp<-dummyVars(~ARP,data = afsnt,levelsOnly = T)
odp<-dummyVars(~ODP,data=afsnt,levelsOnly = T)
sdt_dy<-dummyVars(~SDT_DY,data=afsnt,levelsOnly = T)
flo<-dummyVars(~FLO,data=afsnt,levelsOnly = T)
afsnt<-cbind(afsnt,predict(arp,newdata=afsnt),predict(odp,newdata=afsnt),predict(sdt_dy,newdata=afsnt),predict(flo,newdata=afsnt))
# Train Data - PAST DELAY 입히기
# PAST: delay status of the same aircraft's immediately preceding flight.
# (Duplicate of the past_tuner defined in the Feature Engineering section.)
#
# Flights are ranked within each (DATE, REG) group by scheduled time; a flight
# gets PAST = 1 when the previous flight of the same aircraft on the same day
# was delayed, 0 when it was on time or when there is no previous flight
# (first of the day, or blank REG), and NA when the rank sequence is broken
# (e.g. tied ranks from rank() defeat the ORDER - 1 == previous check).
#
# @param df AFSNT-style data frame with SDT_YY/SDT_MM/SDT_DD, STT, REG, DLY.
# @return `df` arranged by DATE, REG, TIME with a PAST column appended and the
#   helper columns DATE/TIME/ORDER removed.
past_tuner <- function(df){
df <- df %>% mutate(DATE=as.Date(paste0(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
TIME=as.numeric(gsub(':','',STT))) %>%
group_by(DATE, REG) %>% mutate(ORDER=rank(TIME)) %>% arrange(DATE, REG, TIME)
ord <- df$ORDER; dly <- df$DLY; reg <- df$REG
n <- nrow(df)
# Preallocate the result: the original grew a vector with append() inside the
# loop, which is O(n^2) on hundreds of thousands of rows.
past <- numeric(n)
for (i in seq_len(n)){
if (ord[i] == 1 || reg[i] == ''){
past[i] <- 0
}
else if (ord[i] - 1 == ord[i-1]){
past[i] <- if (dly[i-1] == 'Y') 1 else 0
}
else {
past[i] <- NA
}
}
output <- cbind(df, PAST=past) %>% as.data.frame() %>% select(-c(DATE, TIME, ORDER))
return(output)
}
afsnt<-past_tuner(afsnt)
afsnt$PAST<-ifelse(is.na(afsnt$PAST),0,afsnt$PAST)
third <- AFSNT %>% group_by(REG) %>% mutate(ALL=n()) %>%
filter(str_detect(DRR,'C')==T, DRR!='C02') %>%
group_by(REG) %>% mutate(N=n(),RATE=N/ALL)
b <- third %>% select(REG, RATE) %>% unique()
## 0.02 0.04 0.1 etc
# Left Skewed 된 분포를 띤다. 기체 당 정비 문제가 평균적으로 균일하게 일어나지만, 그 중에 예외로 정비 비율이 높은 기체가 존재한다.
REG1<-b$REG[b$RATE<=0.01]
REG2<-b$REG[b$RATE>0.01 & b$RATE <=0.04]
REG3<-b$REG[b$RATE>0.04 ]
afsnt$REG_RATE<-ifelse(afsnt$REG %in% REG1,"REG1",
ifelse(afsnt$REG %in% REG2,"REG2",
ifelse(afsnt$REG %in% REG3,"REG3",NA)))
reg_rate<-dummyVars(~REG_RATE,data=afsnt,levelsOnly = T)
nreg<-which(is.na(afsnt$REG_RATE))
afsnt<-cbind(afsnt,predict(reg_rate,newdata=afsnt))
# Rows whose REG has no recorded maintenance-delay rate end up with
# REG_RATE == NA, and dummyVars then yields NA dummy columns; zero them out.
# BUG FIX: the original iterated `for (i in nrow(nreg))` -- `nreg` is an index
# vector from which(), so nrow() is NULL and the loop never executed; it also
# assigned to non-existent REG_RATE1/2/3 columns instead of the actual
# REG_RATEREG1/2/3 dummy columns produced by dummyVars.
for (i in nreg) {
  afsnt$REG_RATE[i] <- 0
  afsnt$REG_RATEREG1[i] <- 0
  afsnt$REG_RATEREG2[i] <- 0
  afsnt$REG_RATEREG3[i] <- 0
}
# Assemble the final modeling feature matrix.
#
# Adds a zero INDEX placeholder column, then keeps only the engineered
# features -- schedule complexity, airport statistics, the hourly C02 rate,
# weather variables, one-hot ARP/ODP/weekday/airline dummies, PAST, and the
# REG_RATE dummies -- plus bookkeeping columns (INDEX, DLY, REG, SDT_DD, STT)
# that the row-by-row prediction loop needs later.
#
# NOTE(review): ARPARP10 / ODPARP10 are absent from the selection -- presumably
# ARP10 never appears in the filtered training data; confirm.
train_tuner <- function(df){
df$INDEX <- 0
df <- df %>% select(COMPLEXITY,APRON_SIZE,CRAFT_STAND,C02,
PS_AVG,WSPD_AVG,WSPD_MAX,WSPD_INS,CA_TOT_AVG,
RN_SUM,VIS_MNM,CLA,BASE,MIST,HM_INV,
ARPARP1,ARPARP2,ARPARP3,ARPARP4,ARPARP5,ARPARP6,
ARPARP7,ARPARP8,ARPARP9,ARPARP11,ARPARP12,
ARPARP13,ARPARP14,ARPARP15,
ODPARP1,ODPARP2,ODPARP3,ODPARP4,ODPARP5,ODPARP6,
ODPARP7,ODPARP8,ODPARP9,ODPARP11,ODPARP12,
ODPARP13,ODPARP14,ODPARP15,
SDT_DY월,SDT_DY화,SDT_DY수,SDT_DY목,SDT_DY금,
SDT_DY토,SDT_DY일,
FLOA,FLOB,FLOF,FLOH,FLOI,FLOJ,FLOL,
PAST,REG_RATEREG1,REG_RATEREG2,REG_RATEREG3,
INDEX,DLY,REG,SDT_DD,STT)
return(df)
}
colnames(train_tuner(afsnt)) [1] "COMPLEXITY" "APRON_SIZE" "CRAFT_STAND" "C02"
[5] "PS_AVG" "WSPD_AVG" "WSPD_MAX" "WSPD_INS"
[9] "CA_TOT_AVG" "RN_SUM" "VIS_MNM" "CLA"
[13] "BASE" "MIST" "HM_INV" "ARPARP1"
[17] "ARPARP2" "ARPARP3" "ARPARP4" "ARPARP5"
[21] "ARPARP6" "ARPARP7" "ARPARP8" "ARPARP9"
[25] "ARPARP11" "ARPARP12" "ARPARP13" "ARPARP14"
[29] "ARPARP15" "ODPARP1" "ODPARP2" "ODPARP3"
[33] "ODPARP4" "ODPARP5" "ODPARP6" "ODPARP7"
[37] "ODPARP8" "ODPARP9" "ODPARP11" "ODPARP12"
[41] "ODPARP13" "ODPARP14" "ODPARP15" "SDT_DY월"
[45] "SDT_DY화" "SDT_DY수" "SDT_DY목" "SDT_DY금"
[49] "SDT_DY토" "SDT_DY일" "FLOA" "FLOB"
[53] "FLOF" "FLOH" "FLOI" "FLOJ"
[57] "FLOL" "PAST" "REG_RATEREG1" "REG_RATEREG2"
[61] "REG_RATEREG3" "INDEX" "DLY" "REG"
[65] "SDT_DD" "STT"
# write.csv(afsnt,"train_model.csv",row.names=F)4.2 테스트 데이터
AFSNT_DLY에 생성된 변수들을 추가해야 합니다.
4.2.1 forecast weather
시계열 분석을 통해 9월 16일부터 9월 30일 까지의 날씨를 예측했습니다.
### ARP15(incheon)
dat_list_sep<-list()
for(i in 1:18){
wd<-"./data/forecast/ARP15/"
if(i<10){
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_200",i,"09_RKSI.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP15",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
} else {
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_20",i,"09_RKSI.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP15",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
}
}
weather_forecast_arp15<-rbind(dat_list_sep[[1]],dat_list_sep[[2]],dat_list_sep[[3]],dat_list_sep[[4]],dat_list_sep[[5]],dat_list_sep[[6]],dat_list_sep[[7]],dat_list_sep[[8]],dat_list_sep[[9]],dat_list_sep[[10]],dat_list_sep[[11]],dat_list_sep[[12]],dat_list_sep[[13]],dat_list_sep[[14]],dat_list_sep[[15]],dat_list_sep[[16]],dat_list_sep[[17]],dat_list_sep[[18]]) %>%
filter(DATE>=16) %>%
select(-c(CLF,SD_MAX,WD_MAX,WD_INS))
weather_forecast_arp15_ts<-list()
weather_forecast_arp15_arima<-list()
weather_forecast_arp15_pred<-list()
weather_forecast_arp15_predict<-data.frame(ARP="ARP15",DATE=c(16:30))
# par(mfrow=c(5,5))
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
weather_forecast_arp15_ts[[i]]<-ts(weather_forecast_arp15[,i],start = c(2001,1),frequency = 15)
weather_forecast_arp15_arima[[i]]<-arima(weather_forecast_arp15_ts[[i]],c(2,0,0))
pred<-predict(weather_forecast_arp15_arima[[i]],n.ahead=15)
weather_forecast_arp15_pred[[i]]<-pred$pred
weather_forecast_arp15_predict<-cbind(weather_forecast_arp15_predict,as.numeric(weather_forecast_arp15_pred[[i]]))
}
colnames(weather_forecast_arp15_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
## time seires plot
ts.plot(weather_forecast_arp15_ts[[1]],weather_forecast_arp15_pred[[1]],lty=c(1,3),col="darkslategray3",main="Time-Series plot of PS_AVG")## 최대한 수렴하는 값을 없애려고 16~30일로 제한
### ARP3(jeju)
dat_list_sep<-list()
for(i in 1:18){
wd<-"./data/forecast/ARP3/"
if(i<10){
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_200",i,"09_RKPC.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP3",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
} else {
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_20",i,"09_RKPC.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP3",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
}
}
weather_forecast_arp3<-rbind(dat_list_sep[[1]],dat_list_sep[[2]],dat_list_sep[[3]],dat_list_sep[[4]],dat_list_sep[[5]],dat_list_sep[[6]],dat_list_sep[[7]],dat_list_sep[[8]],dat_list_sep[[9]],dat_list_sep[[10]],dat_list_sep[[11]],dat_list_sep[[12]],dat_list_sep[[13]],dat_list_sep[[14]],dat_list_sep[[15]],dat_list_sep[[16]],dat_list_sep[[17]],dat_list_sep[[18]]) %>%
filter(DATE>=16) %>%
select(-c(CLF,SD_MAX,WD_MAX,WD_INS))
weather_forecast_arp3_ts<-list()
weather_forecast_arp3_arima<-list()
weather_forecast_arp3_pred<-list()
weather_forecast_arp3_predict<-data.frame(ARP="ARP3",DATE=c(16:30))
# par(mfrow=c(5,5))
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
weather_forecast_arp3_ts[[i]]<-ts(weather_forecast_arp3[,i],start = c(2001,1),frequency = 15)
weather_forecast_arp3_arima[[i]]<-arima(weather_forecast_arp3_ts[[i]],c(2,0,0))
pred<-predict(weather_forecast_arp3_arima[[i]],n.ahead=15)
weather_forecast_arp3_pred[[i]]<-pred$pred
weather_forecast_arp3_predict<-cbind(weather_forecast_arp3_predict,as.numeric(weather_forecast_arp3_pred[[i]]))
# ts.plot(weather_forecast_arp15_ts[[i]],pred$pred,lty=c(1,3))
}
colnames(weather_forecast_arp3_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
### ARP1(kimpo)
dat_list_sep<-list()
for(i in 1:18){
wd<-"./data/forecast/ARP1/"
if(i<10){
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_200",i,"09_RKSS.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP1",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
} else {
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_20",i,"09_RKSS.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP1",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
}
}
weather_forecast_arp1<-rbind(dat_list_sep[[1]],dat_list_sep[[2]],dat_list_sep[[3]],dat_list_sep[[4]],dat_list_sep[[5]],dat_list_sep[[6]],dat_list_sep[[7]],dat_list_sep[[8]],dat_list_sep[[9]],dat_list_sep[[10]],dat_list_sep[[11]],dat_list_sep[[12]],dat_list_sep[[13]],dat_list_sep[[14]],dat_list_sep[[15]],dat_list_sep[[16]],dat_list_sep[[17]],dat_list_sep[[18]]) %>%
filter(DATE>=16) %>%
select(-c(CLF,SD_MAX,WD_MAX,WD_INS))
weather_forecast_arp1_ts<-list()
weather_forecast_arp1_arima<-list()
weather_forecast_arp1_pred<-list()
weather_forecast_arp1_predict<-data.frame(ARP="ARP1",DATE=c(16:30))
# par(mfrow=c(5,5))
# Fit an AR(2) model to each weather variable's 18-year September history at
# ARP1 (Gimpo) and forecast the 15 target days (Sep 16-30).
# BUG FIX: the original built the time series from `weather_forecast_arp15`
# (copy-paste left over from the ARP15 section); use the ARP1 history.
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
  weather_forecast_arp1_ts[[i]]<-ts(weather_forecast_arp1[,i],start = c(2001,1),frequency = 15)
  weather_forecast_arp1_arima[[i]]<-arima(weather_forecast_arp1_ts[[i]],c(2,0,0))
  pred<-predict(weather_forecast_arp1_arima[[i]],n.ahead=15)
  weather_forecast_arp1_pred[[i]]<-pred$pred
  weather_forecast_arp1_predict<-cbind(weather_forecast_arp1_predict,as.numeric(weather_forecast_arp1_pred[[i]]))
}
colnames(weather_forecast_arp1_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
### ARP7(muan)
dat_list_sep<-list()
for(i in 8:18){
wd<-"./data/forecast/ARP7/"
if(i<10){
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_200",i,"09_RKJB.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP7",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
} else {
dat_list_sep[[i]]<-read.csv(paste0(wd,"MONTH_20",i,"09_RKJB.csv"),header = T,fileEncoding = "UTF8",stringsAsFactors = F)
dat_list_sep[[i]]<-cbind((2000+i),9,"ARP7",dat_list_sep[[i]])
colnames(dat_list_sep[[i]])[c(1,2,3)]<-c("SDT_YY","SDT_MM","ARP")
dat_list_sep[[i]]$RN_SUM<-ifelse(is.na(dat_list_sep[[i]]$RN_SUM),0,dat_list_sep[[i]]$RN_SUM)
dat_list_sep[[i]]$SD_MAX<-ifelse(is.na(dat_list_sep[[i]]$SD_MAX),0,dat_list_sep[[i]]$SD_MAX)
dat_list_sep[[i]]$VIS_MNM<-ifelse(is.na(dat_list_sep[[i]]$VIS_MNM),1000,dat_list_sep[[i]]$VIS_MNM)
dat_list_sep[[i]]$CLA<-ifelse(is.na(dat_list_sep[[i]]$CLA),0,dat_list_sep[[i]]$CLA)
dat_list_sep[[i]]$BASE<-ifelse(is.na(dat_list_sep[[i]]$BASE),0,dat_list_sep[[i]]$BASE)
}
}
weather_forecast_arp7<-rbind(dat_list_sep[[1]],dat_list_sep[[2]],dat_list_sep[[3]],dat_list_sep[[4]],dat_list_sep[[5]],dat_list_sep[[6]],dat_list_sep[[7]],dat_list_sep[[8]],dat_list_sep[[9]],dat_list_sep[[10]],dat_list_sep[[11]],dat_list_sep[[12]],dat_list_sep[[13]],dat_list_sep[[14]],dat_list_sep[[15]],dat_list_sep[[16]],dat_list_sep[[17]],dat_list_sep[[18]]) %>%
filter(DATE>=16) %>%
select(-c(CLF,SD_MAX,WD_MAX,WD_INS))
weather_forecast_arp7_ts<-list()
weather_forecast_arp7_arima<-list()
weather_forecast_arp7_pred<-list()
weather_forecast_arp7_predict<-data.frame(ARP="ARP7",DATE=c(16:30))
# par(mfrow=c(5,5))
# Fit an AR(2) model per weather variable at ARP7 (Muan) and forecast the 15
# target days (Sep 16-30).
# BUG FIX: the original fit arima() on `weather_forecast_arp15_ts` (copy-paste
# from the ARP15 section); it must use the ARP7 series built one line above.
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
  weather_forecast_arp7_ts[[i]]<-ts(weather_forecast_arp7[,i],start = c(2001,1),frequency = 15)
  weather_forecast_arp7_arima[[i]]<-arima(weather_forecast_arp7_ts[[i]],c(2,0,0))
  pred<-predict(weather_forecast_arp7_arima[[i]],n.ahead=15)
  weather_forecast_arp7_pred[[i]]<-pred$pred
  weather_forecast_arp7_predict<-cbind(weather_forecast_arp7_predict,as.numeric(weather_forecast_arp7_pred[[i]]))
}
colnames(weather_forecast_arp7_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
### ARP5(ulsan)
# Load 18 Septembers (2001..2018) of daily weather for ARP5 (Ulsan, RKPU),
# forecast the second half of the month per variable with an AR(2) model.
dat_list_sep <- list()
wd <- "./data/forecast/ARP5/"
# Default missing observations: no rain/snow/cloud -> 0, missing minimum
# visibility -> 1000 (treated as clear), missing cloud base -> 0.
fill_weather_na <- function(d) {
  d$RN_SUM  <- ifelse(is.na(d$RN_SUM), 0, d$RN_SUM)
  d$SD_MAX  <- ifelse(is.na(d$SD_MAX), 0, d$SD_MAX)
  d$VIS_MNM <- ifelse(is.na(d$VIS_MNM), 1000, d$VIS_MNM)
  d$CLA     <- ifelse(is.na(d$CLA), 0, d$CLA)
  d$BASE    <- ifelse(is.na(d$BASE), 0, d$BASE)
  d
}
for (i in 1:18) {
  # paste0("MONTH_", 2000 + i, "09_...") yields "MONTH_200109".."MONTH_201809",
  # covering both name patterns the original handled with a duplicated
  # if (i < 10) / else branch.
  dat_list_sep[[i]] <- read.csv(paste0(wd, "MONTH_", 2000 + i, "09_RKPU.csv"),
                                header = TRUE, fileEncoding = "UTF8",
                                stringsAsFactors = FALSE)
  dat_list_sep[[i]] <- cbind((2000 + i), 9, "ARP5", dat_list_sep[[i]])
  colnames(dat_list_sep[[i]])[c(1, 2, 3)] <- c("SDT_YY", "SDT_MM", "ARP")
  dat_list_sep[[i]] <- fill_weather_na(dat_list_sep[[i]])
}
# Keep only the second half of the month and drop unused columns.
weather_forecast_arp5 <- do.call(rbind, dat_list_sep) %>%
  filter(DATE >= 16) %>%
  select(-c(CLF, SD_MAX, WD_MAX, WD_INS))
weather_forecast_arp5_ts<-list()
weather_forecast_arp5_arima<-list()
weather_forecast_arp5_pred<-list()
weather_forecast_arp5_predict<-data.frame(ARP="ARP5",DATE=c(16:30))
# par(mfrow=c(5,5))
# AR(2) forecast of the next 15 days for each weather variable.
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
  weather_forecast_arp5_ts[[i]]<-ts(weather_forecast_arp5[,i],start = c(2001,1),frequency = 15)
  weather_forecast_arp5_arima[[i]]<-arima(weather_forecast_arp5_ts[[i]],c(2,0,0))
  pred<-predict(weather_forecast_arp5_arima[[i]],n.ahead=15)
  weather_forecast_arp5_pred[[i]]<-pred$pred
  weather_forecast_arp5_predict<-cbind(weather_forecast_arp5_predict,as.numeric(weather_forecast_arp5_pred[[i]]))
  # ts.plot(weather_forecast_arp5_ts[[i]],pred$pred,lty=c(1,3))
}
colnames(weather_forecast_arp5_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
### ARP10(yang)
# Same pipeline for ARP10 (Yangyang, RKNY). The loop starts at 2, i.e. 2002;
# presumably no 2001 file exists for this airport -- TODO confirm.
dat_list_sep <- list()
wd <- "./data/forecast/ARP10/"
# Default missing observations (same policy as the other airports).
fill_weather_na <- function(d) {
  d$RN_SUM  <- ifelse(is.na(d$RN_SUM), 0, d$RN_SUM)
  d$SD_MAX  <- ifelse(is.na(d$SD_MAX), 0, d$SD_MAX)
  d$VIS_MNM <- ifelse(is.na(d$VIS_MNM), 1000, d$VIS_MNM)
  d$CLA     <- ifelse(is.na(d$CLA), 0, d$CLA)
  d$BASE    <- ifelse(is.na(d$BASE), 0, d$BASE)
  d
}
for (i in 2:18) {
  # One filename expression replaces the original duplicated if/else.
  dat_list_sep[[i]] <- read.csv(paste0(wd, "MONTH_", 2000 + i, "09_RKNY.csv"),
                                header = TRUE, fileEncoding = "UTF8",
                                stringsAsFactors = FALSE)
  dat_list_sep[[i]] <- cbind((2000 + i), 9, "ARP10", dat_list_sep[[i]])
  colnames(dat_list_sep[[i]])[c(1, 2, 3)] <- c("SDT_YY", "SDT_MM", "ARP")
  dat_list_sep[[i]] <- fill_weather_na(dat_list_sep[[i]])
}
# dat_list_sep[[1]] is NULL (loop starts at 2); rbind drops NULLs, matching
# the original's rbind(dat_list_sep[[1]], ...) behavior.
weather_forecast_arp10 <- do.call(rbind, dat_list_sep) %>%
  filter(DATE >= 16) %>%
  select(-c(CLF, SD_MAX, WD_MAX, WD_INS))
weather_forecast_arp10_ts<-list()
weather_forecast_arp10_arima<-list()
weather_forecast_arp10_pred<-list()
weather_forecast_arp10_predict<-data.frame(ARP="ARP10",DATE=c(16:30))
# par(mfrow=c(5,5))
# AR(2) forecast of the next 15 days for each weather variable.
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
  weather_forecast_arp10_ts[[i]]<-ts(weather_forecast_arp10[,i],start = c(2001,1),frequency = 15)
  weather_forecast_arp10_arima[[i]]<-arima(weather_forecast_arp10_ts[[i]],c(2,0,0))
  pred<-predict(weather_forecast_arp10_arima[[i]],n.ahead=15)
  weather_forecast_arp10_pred[[i]]<-pred$pred
  weather_forecast_arp10_predict<-cbind(weather_forecast_arp10_predict,as.numeric(weather_forecast_arp10_pred[[i]]))
  # ts.plot(weather_forecast_arp10_ts[[i]],pred$pred,lty=c(1,3))
}
colnames(weather_forecast_arp10_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
### ARP9(yusu)
# Same pipeline for ARP9 (Yeosu, RKJY), years 2001..2018.
dat_list_sep <- list()
wd <- "./data/forecast/ARP9/"
# Default missing observations (same policy as the other airports).
fill_weather_na <- function(d) {
  d$RN_SUM  <- ifelse(is.na(d$RN_SUM), 0, d$RN_SUM)
  d$SD_MAX  <- ifelse(is.na(d$SD_MAX), 0, d$SD_MAX)
  d$VIS_MNM <- ifelse(is.na(d$VIS_MNM), 1000, d$VIS_MNM)
  d$CLA     <- ifelse(is.na(d$CLA), 0, d$CLA)
  d$BASE    <- ifelse(is.na(d$BASE), 0, d$BASE)
  d
}
for (i in 1:18) {
  # One filename expression replaces the original duplicated if/else.
  dat_list_sep[[i]] <- read.csv(paste0(wd, "MONTH_", 2000 + i, "09_RKJY.csv"),
                                header = TRUE, fileEncoding = "UTF8",
                                stringsAsFactors = FALSE)
  dat_list_sep[[i]] <- cbind((2000 + i), 9, "ARP9", dat_list_sep[[i]])
  colnames(dat_list_sep[[i]])[c(1, 2, 3)] <- c("SDT_YY", "SDT_MM", "ARP")
  dat_list_sep[[i]] <- fill_weather_na(dat_list_sep[[i]])
}
weather_forecast_arp9 <- do.call(rbind, dat_list_sep) %>%
  filter(DATE >= 16) %>%
  select(-c(CLF, SD_MAX, WD_MAX, WD_INS))
weather_forecast_arp9_ts<-list()
weather_forecast_arp9_arima<-list()
weather_forecast_arp9_pred<-list()
weather_forecast_arp9_predict<-data.frame(ARP="ARP9",DATE=c(16:30))
# par(mfrow=c(5,5))
# AR(2) forecast of the next 15 days for each weather variable.
for(i in c("PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")){
  weather_forecast_arp9_ts[[i]]<-ts(weather_forecast_arp9[,i],start = c(2001,1),frequency = 15)
  weather_forecast_arp9_arima[[i]]<-arima(weather_forecast_arp9_ts[[i]],c(2,0,0))
  pred<-predict(weather_forecast_arp9_arima[[i]],n.ahead=15)
  weather_forecast_arp9_pred[[i]]<-pred$pred
  weather_forecast_arp9_predict<-cbind(weather_forecast_arp9_predict,as.numeric(weather_forecast_arp9_pred[[i]]))
  # ts.plot(weather_forecast_arp9_ts[[i]],pred$pred,lty=c(1,3))
}
colnames(weather_forecast_arp9_predict)<-c("ARP","DATE","PS_AVG","TMP_AVG","TMP_MAX","TMP_MNM","TD_AVG","HM_AVG","HM_MNM","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE")
## export the data
# write.csv(weather_forecast_arp1,"weather_forecast_arp1.csv",row.names = F)
# write.csv(weather_forecast_arp3,"weather_forecast_arp3.csv",row.names = F)
# write.csv(weather_forecast_arp5,"weather_forecast_arp5.csv",row.names = F)
# write.csv(weather_forecast_arp7,"weather_forecast_arp7.csv",row.names = F)
# write.csv(weather_forecast_arp9,"weather_forecast_arp9.csv",row.names = F)
# write.csv(weather_forecast_arp10,"weather_forecast_arp10.csv",row.names = F)
# write.csv(weather_forecast_arp15,"weather_forecast_arp15.csv",row.names = F)
## Stack the seven stations' 15-day forecasts and derive two composite
## features: MIST = TMP_MNM - TD_AVG (temperature/dew-point spread, a fog
## proxy) and HM_INV = 1/HM_AVG; raw temperature/humidity columns dropped.
weather_forecast_predict<-rbind(weather_forecast_arp1_predict,weather_forecast_arp3_predict,weather_forecast_arp5_predict,weather_forecast_arp7_predict,weather_forecast_arp9_predict,weather_forecast_arp10_predict,weather_forecast_arp15_predict) %>%
  mutate(MIST=TMP_MNM-TD_AVG,HM_INV=1/HM_AVG) %>%
  select(-c(TMP_AVG,TMP_MAX,TMP_MNM,TD_AVG,HM_AVG,HM_MNM))
colnames(weather_forecast_predict)<-c("ARP","DATE","PS_AVG","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG","RN_SUM","VIS_MNM","CLA","BASE","MIST","HM_INV")
# write.csv(weather_forecast_predict,"weather_forecast_predict.csv",row.names=F)
## FIX: the commented write.csv above and the fread below were fused onto one
## line in the original, which commented out the data loading entirely.
AFSNT<-fread("./data/AFSNT.csv",header = T,stringsAsFactors = F); afsnt<-AFSNT
AFSNT_DLY<-fread("./data/AFSNT_DLY.csv",header = T,stringsAsFactors = F)
# NOTE(review): presumably a manual correction of a mis-coded airport value
# in row 3, column 5 -- confirm against the raw file.
AFSNT_DLY[3,5]<-"ARP1"
weather_forecast<-fread("./data/weather_forecast_predict.csv",header = T,stringsAsFactors = F)
prop_C02_A<-fread("./data/Proportion_C02_A.csv",header = T,stringsAsFactors = F)
prop_C02_D<-fread("./data/Proportion_C02_D.csv",header = T,stringsAsFactors = F)
airport <- fread("./data/airport_data.csv",header = T,stringsAsFactors = F)
# Parse schedule fields on the historical data: DATE as a Date, SCT as an
# integer HHMM of the scheduled time, and DIF = actual minus scheduled time.
afsnt <- afsnt %>% mutate(DATE=as_date(paste(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
SCT=as.integer(gsub(':','',STT)), # integer HHMM, used to order flights
STT2=hm(STT), # scheduled time as an hour-minute period
ATT2=hm(ATT), # actual time as an hour-minute period
DIF=ATT2-STT2) # delay as a lubridate period
# Convert the period to minutes via its hour/minute slots.
afsnt$DIF <- afsnt$DIF@hour*60 + afsnt$DIF@minute # delay in minutes
afsnt <- afsnt %>% select(-c(STT2, ATT2))
### Same date/time tidy-up for the prediction set; INDEX preserves row order
### so predictions can be written back in the original order.
AFSNT_DLY <- AFSNT_DLY %>% mutate(DATE=as_date(paste(SDT_YY,'-',SDT_MM,'-',SDT_DD)),
SCT=as.integer(gsub(':','',STT)),
INDEX=1:nrow(AFSNT_DLY))
#######################
# Apply the tail-number (REG) routine learned from AFSNT to the test data.
####### Attach train-data REG assignments to the test data.
# Step 1: take the most recent schedule -- the last week of June 2019 --
# keyed by day-of-week + flight number + arrival/departure (DY_FLT).
AFSNT_DLY$DY_FLT <- paste0(AFSNT_DLY$SDT_DY,AFSNT_DLY$FLT,AFSNT_DLY$AOD)
latest_afsnt <- afsnt %>% filter(DATE>='2019-06-24')
latest_afsnt$DY_FLT <- paste0(latest_afsnt$SDT_DY,latest_afsnt$FLT,latest_afsnt$AOD)
latest_afsnt <- latest_afsnt %>% select(DY_FLT,REG)
# Join the test rows with the latest schedule into AFSNT_DLY_join.
AFSNT_DLY_join <- left_join(AFSNT_DLY,latest_afsnt, by= 'DY_FLT') %>% arrange(INDEX)
AFSNT_DLY_join$REG[is.na(AFSNT_DLY_join$REG)] <- ""
#### Step 2: fill in flights that did not appear in the last week of June.
# Keep only the rows whose REG is still unresolved.
NAREG <- AFSNT_DLY_join %>% filter(AFSNT_DLY_join$REG=="")
NAREG <- NAREG[,1:16] # drop the REG column
#########
# Retry the unresolved flights against the third week of June.
# Select the third week.
W3_afsnt <- afsnt %>% filter(DATE>='2019-06-16',DATE<='2019-06-22')
W3_afsnt$DY_FLT <- paste0(W3_afsnt$SDT_DY,W3_afsnt$FLT,W3_afsnt$AOD)
W3_afsnt <- W3_afsnt %>% select(REG, DY_FLT)
# Join the unresolved rows with the week-3 schedule.
NA_REG_join <- left_join(NAREG,W3_afsnt, by= 'DY_FLT') %>% arrange(INDEX)
NA_REG_join$DATE_FLT_AOD <- paste0(NA_REG_join$DATE,NA_REG_join$FLT,NA_REG_join$AOD)
NA_REG_join <-NA_REG_join %>% select(DATE_FLT_AOD,REG)
AFSNT_DLY_join$DATE_FLT_AOD <- paste0(AFSNT_DLY_join$DATE,AFSNT_DLY_join$FLT,AFSNT_DLY_join$AOD)
# Merge the newly resolved REGs back into the main frame; REG.x is the
# week-4 assignment, REG.y the week-3 fallback.
AFSNT_DLY_join <- left_join(AFSNT_DLY_join,NA_REG_join,by='DATE_FLT_AOD')
AFSNT_DLY_join$REG.x[is.na(AFSNT_DLY_join$REG.x)] <- ""
AFSNT_DLY_join$REG.y[is.na(AFSNT_DLY_join$REG.y)] <- ""
AFSNT_DLY_join$REG.x<-ifelse(AFSNT_DLY_join$REG.x=="",AFSNT_DLY_join$REG.y,AFSNT_DLY_join$REG.x)
AFSNT_DLY_join<- AFSNT_DLY_join[,-c(18,19)] # drop REG.y and helper columns
names(AFSNT_DLY_join)[names(AFSNT_DLY_join) == "REG.x"] <- c("REG")
###########
###### A flight matched to several REGs is unreliable, so blank it out.
AFSNT_DLY_join$DATE_DY_FLT <- paste0(AFSNT_DLY_join$DATE,AFSNT_DLY_join$DY_FLT)
AFSNT_DLY_join$idx <- 1:nrow(AFSNT_DLY_join)
# Keep the first occurrence of each DATE_DY_FLT; anti_del holds the dropped
# duplicate rows.
dup_del <-AFSNT_DLY_join[-which(duplicated(AFSNT_DLY_join$DATE_DY_FLT)),]
anti_del <- anti_join(AFSNT_DLY_join,dup_del, by = 'idx')
anti_del_idx <- anti_join(AFSNT_DLY_join,dup_del, by = 'idx') %>% select(idx)
# NOTE(review): likely bug. anti_del_idx is a one-column data frame, so the
# %in% test should be against anti_del_idx$idx, and the TRUE branch
# dup_del$REG=="" yields a logical, not the empty string the section comment
# describes. Moreover dup_del's idx values are by construction disjoint from
# anti_del's, so as written this line is a no-op. Intent needs confirming
# (probably: blank REG where DATE_DY_FLT had duplicates).
dup_del$REG <- ifelse(dup_del$idx %in% anti_del_idx, dup_del$REG=="",dup_del$REG)
dup_del<-dup_del %>% select(-c(DY_FLT,DATE_DY_FLT,idx,DATE,SCT))
afsnt_dly<-dup_del
## complex
# For one airport (ARP<i>), order all flights (departures then arrivals) by
# scheduled time and compute COMPLEXITY: for each flight, the number of
# flights since the first flight of the same day whose scheduled time falls
# within the preceding 60 minutes -- a congestion proxy. Interface unchanged.
arp_tuner <- function(df,i){
  afsnt_d <- df %>% filter(AOD=='D' & ARP==paste0('ARP',i))
  afsnt_a <- df %>% filter(AOD=='A' & ARP==paste0('ARP',i))
  afsnt_arp <- rbind(afsnt_d, afsnt_a)
  afsnt_arp <- afsnt_arp %>% select(SDT_YY,SDT_MM,SDT_DD,STT,ARP,ODP,AOD,FLT) %>%
    mutate(STTT=paste(SDT_YY,SDT_MM,SDT_DD,STT,sep=':'),
           POSIX=as.POSIXct(STTT,format='%Y:%m:%d:%H:%M')) %>% arrange(POSIX)
  afsnt_arp$schedule <- strptime(afsnt_arp$STTT,'%Y:%m:%d:%H:%M')
  # Minutes since midnight of the scheduled time.
  schedule_time <- afsnt_arp$schedule$hour*60 + afsnt_arp$schedule$min
  n <- nrow(afsnt_arp)
  # Preallocate instead of growing the vector one element per iteration
  # (the original COMPLEXITY <- c(1) copied the vector on every append).
  COMPLEXITY <- numeric(n)
  COMPLEXITY[1] <- 1
  tmp <- 1  # index of the first flight of the current day
  # FIX: the original reused `i` (the airport number parameter) as this loop
  # index, shadowing it; renamed to `r`. Also guard n < 2, where 2:n would
  # run backwards (2, 1) and index out of range.
  if (n >= 2) {
    for (r in 2:n){
      if(afsnt_arp$schedule[r]$mday!=afsnt_arp$schedule[r-1]$mday){
        # New day: reset the day-start pointer.
        tmp <- r; COMPLEXITY[r] <- 1
      } else{
        cnt <- 0
        for (j in tmp:r){
          if(schedule_time[r] - schedule_time[j] <= 60){
            cnt <- cnt+1
          }
        }
        COMPLEXITY[r] <- cnt
      }
    }
  }
  afsnt_arp <- cbind(afsnt_arp, COMPLEXITY) %>% select(-c('POSIX','STTT','schedule'))
  return(afsnt_arp)
}
# Compute COMPLEXITY for every airport (ARP1..ARP15 in order) and merge it
# back onto the schedule, keyed by the flight-identity columns.
COMPLEXITY <- do.call(rbind, lapply(1:15, function(arp_no) arp_tuner(afsnt_dly, arp_no)))
afsnt_dly <- left_join(afsnt_dly, COMPLEXITY,
                       by = c("SDT_YY","SDT_MM","SDT_DD","STT","ARP","ODP","AOD","FLT"))
## Airport data: join static airport attributes by airport and year.
afsnt_dly<-left_join(afsnt_dly,airport,by=c("ARP","SDT_YY"))
## Split by AOD: A (arrival) vs D (departure) -- each gets its own C02 rate.
afsnt_dly_D<-afsnt_dly %>% filter(AOD=="D")
afsnt_dly_A<-afsnt_dly %>% filter(AOD=="A")
## Historical C02 (A/C connection) delay proportion per airport x hour slot,
## joined to arrivals.
afsnt_dly_A$schedule<-strptime(afsnt_dly_A$STT,"%H:%M")
afsnt_dly_A$timeslot<-afsnt_dly_A$schedule$hour
afsnt_dly_A<-left_join(afsnt_dly_A %>% select(-schedule), prop_C02_A ,by=c("ARP","timeslot"))
afsnt_dly_A <- afsnt_dly_A %>% select(-c(timeslot,n,DLY_sum,DRR_C02)) %>%dplyr::rename(C02=prop_C02)
# Slots never observed in training have no rate; treat as 0.
afsnt_dly_A$C02<-ifelse(is.na(afsnt_dly_A$C02),0,afsnt_dly_A$C02)
## Same join for departures.
afsnt_dly_D$schedule<-strptime(afsnt_dly_D$STT,"%H:%M")
afsnt_dly_D$timeslot<-afsnt_dly_D$schedule$hour
afsnt_dly_D<-left_join(afsnt_dly_D %>% select(-schedule),prop_C02_D,by=c("ARP","timeslot")) %>%
select(-c(timeslot,n,DLY_sum,DRR_C02,prop_DLY)) %>%
dplyr::rename(C02=prop_C02)
afsnt_dly_D$C02<-ifelse(is.na(afsnt_dly_D$C02),0,afsnt_dly_D$C02)
# Map each airport without its own weather forecast onto a proxy station
# (e.g. ARP2/ARP4/ARP11 -> ARP5) -- presumably the nearest forecast site;
# TODO confirm the mapping rationale. Departures use the departure airport,
# arrivals use the origin airport (ODP).
afsnt_dly_D$ARP_fake <- ifelse(afsnt_dly_D$ARP %in% c("ARP2","ARP4","ARP11"),"ARP5",
ifelse(afsnt_dly_D$ARP %in% c("ARP8","ARP12"),"ARP9",
ifelse(afsnt_dly_D$ARP=="ARP6","ARP15",
ifelse(afsnt_dly_D$ARP=="ARP13","ARP7",
ifelse(afsnt_dly_D$ARP=="ARP14","ARP10",afsnt_dly_D$ARP)))))
afsnt_dly_A$ODP_fake<-ifelse(afsnt_dly_A$ODP %in% c("ARP2","ARP4","ARP11"),"ARP5",
ifelse(afsnt_dly_A$ODP %in% c("ARP8","ARP12"),"ARP9",
ifelse(afsnt_dly_A$ODP=="ARP6","ARP15",
ifelse(afsnt_dly_A$ODP=="ARP13","ARP7",
ifelse(afsnt_dly_A$ODP=="ARP14","ARP10",afsnt_dly_A$ODP)))))
# Attach the forecast weather for the relevant station and day, then rejoin
# the two halves.
afsnt_dly_D <- left_join(afsnt_dly_D,weather_forecast,by=c("ARP_fake"="ARP","SDT_DD"="DATE")) %>% select(-ARP_fake)
afsnt_dly_A<-left_join(afsnt_dly_A,weather_forecast,by=c("ODP_fake"="ARP","SDT_DD"="DATE")) %>% select(-ODP_fake)
afsnt_dly<-rbind(afsnt_dly_A,afsnt_dly_D)
# One-hot encode the categorical features with caret::dummyVars.
arp<-dummyVars(~ARP,data = afsnt_dly,levelsOnly = T)
odp<-dummyVars(~ODP,data=afsnt_dly,levelsOnly = T)
sdt_dy<-dummyVars(~SDT_DY,data=afsnt_dly,levelsOnly = T)
flo<-dummyVars(~FLO,data=afsnt_dly,levelsOnly = T)
afsnt_dly<-cbind(afsnt_dly,predict(arp,newdata=afsnt_dly),predict(odp,newdata=afsnt_dly),predict(sdt_dy,newdata=afsnt_dly),predict(flo,newdata=afsnt_dly))
# PAST will hold "was the previous flight of this tail number delayed";
# initialized to 0 and filled in during row-by-row prediction.
afsnt_dly$PAST<-0
# Per tail number: share of flights whose delay reason is a C-code other
# than C02 (i.e. the aircraft's own maintenance-style issues).
third <- AFSNT %>% group_by(REG) %>% mutate(ALL=n()) %>%
filter(str_detect(DRR,'C')==T, DRR!='C02') %>%
group_by(REG) %>% mutate(N=n(),RATE=N/ALL)
b <- third %>% select(REG, RATE) %>% unique()
## 0.02 0.04 0.1 etc
# The distribution is skewed: most aircraft have a uniformly low issue rate,
# with a few high-rate outliers -- hence three buckets.
REG1<-b$REG[b$RATE<=0.01]
REG2<-b$REG[b$RATE>0.01 & b$RATE <=0.04]
REG3<-b$REG[b$RATE>0.04 ]
afsnt_dly$REG_RATE<-ifelse(afsnt_dly$REG %in% REG1,"REG1",
ifelse(afsnt_dly$REG %in% REG2,"REG2",
ifelse(afsnt_dly$REG %in% REG3,"REG3",NA)))
reg_rate<-dummyVars(~REG_RATE,data=afsnt_dly,levelsOnly = T)
afsnt_dly<-cbind(afsnt_dly,predict(reg_rate,newdata=afsnt_dly))
# Rows with an unknown REG get bucket 0 in the label and in all three dummy
# columns. FIX: the original looped over the indices assigning each row
# individually; vectorized index assignment is equivalent and O(n).
nreg<-which(is.na(afsnt_dly$REG_RATE))
afsnt_dly$REG_RATE[nreg]<-0
afsnt_dly$REG_RATEREG1[nreg]<-0
afsnt_dly$REG_RATEREG2[nreg]<-0
afsnt_dly$REG_RATEREG3[nreg]<-0
# Drop one reference level per one-hot group and restore the original order.
afsnt_dly<-afsnt_dly %>%
select(-c(ARPARP10,ODPARP10)) %>%
arrange(INDEX)
test_tuner <- function(df){
  # Keep the modeling features plus the bookkeeping columns (INDEX, DLY,
  # REG, SDT_DD, STT) needed later for row-by-row prediction. Column order
  # matches the original select() call exactly.
  keep <- c("COMPLEXITY","APRON_SIZE","CRAFT_STAND","C02",
            "PS_AVG","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG",
            "RN_SUM","VIS_MNM","CLA","BASE","MIST","HM_INV",
            paste0("ARPARP", c(1:9, 11:15)),
            paste0("ODPARP", c(1:9, 11:15)),
            paste0("SDT_DY", c("월","화","수","목","금","토","일")),
            paste0("FLO", c("A","B","F","H","I","J","L")),
            "PAST","REG_RATEREG1","REG_RATEREG2","REG_RATEREG3",
            "INDEX","DLY","REG","SDT_DD","STT")
  df %>% select(all_of(keep))
}
colnames(test_tuner(afsnt_dly) %>% select(-c(INDEX,DLY,REG,SDT_DD,STT)))
 [1] "COMPLEXITY" "APRON_SIZE" "CRAFT_STAND" "C02"
[5] "PS_AVG" "WSPD_AVG" "WSPD_MAX" "WSPD_INS"
[9] "CA_TOT_AVG" "RN_SUM" "VIS_MNM" "CLA"
[13] "BASE" "MIST" "HM_INV" "ARPARP1"
[17] "ARPARP2" "ARPARP3" "ARPARP4" "ARPARP5"
[21] "ARPARP6" "ARPARP7" "ARPARP8" "ARPARP9"
[25] "ARPARP11" "ARPARP12" "ARPARP13" "ARPARP14"
[29] "ARPARP15" "ODPARP1" "ODPARP2" "ODPARP3"
[33] "ODPARP4" "ODPARP5" "ODPARP6" "ODPARP7"
[37] "ODPARP8" "ODPARP9" "ODPARP11" "ODPARP12"
[41] "ODPARP13" "ODPARP14" "ODPARP15" "SDT_DY월"
[45] "SDT_DY화" "SDT_DY수" "SDT_DY목" "SDT_DY금"
[49] "SDT_DY토" "SDT_DY일" "FLOA" "FLOB"
[53] "FLOF" "FLOH" "FLOI" "FLOJ"
[57] "FLOL" "PAST" "REG_RATEREG1" "REG_RATEREG2"
[61] "REG_RATEREG3"
# write.csv(afsnt_dly,"afsnt_dly_predict.csv",row.names = F)
4.3 부트스트랩
지연 여부가 약 1:7의 불균형 자료이기 때문에 균형 있게 샘플링하기 위해
로즈 기법을 사용했습니다.
4.3.0.1 로즈기법
로즈: ROSE (Random Over-Sampling Examples)
불균형 데이터에서 사용하는 기법으로써
두 그룹에 대해서 조건부 확률 분포를 통해 새로운 샘플들을 생성하는 부트스트래핑 기법입니다.
# Load the engineered datasets; DLY is recoded to a numeric 0/1 target so it
# can be used with both regression-style and classification learners.
train <- fread("./data/train_model.csv",header=T,stringsAsFactors=F) %>%
mutate(DLY=ifelse(DLY=="Y",1,0))
test <- fread('./data/afsnt_dly_predict.csv',header=T,stringsAsFactors=F) %>%
mutate(DLY=ifelse(DLY=="Y",1,0))
# Reference copy of the test schedule (for the final report columns); INDEX
# re-created to match the prediction output's row keys.
afsnt_dly <- fread('./data/afsnt_dly.csv',header=T,stringsAsFactors=F) %>% select(-c(DLY, DLY_RATE))
afsnt_dly$INDEX <- 1:nrow(afsnt_dly)
# Min-max scale a numeric vector to [0, 1].
# Robustness fix: a constant vector previously produced NaN (0/0 division);
# return all zeros instead so a degenerate column survives scaling.
normalize <- function(x){
  rng <- range(x)
  if (rng[1] == rng[2]) {
    return(rep(0, length(x)))
  }
  (x - rng[1]) / (rng[2] - rng[1])
}
train_tuner <- function(df){
  # Training rows get a dummy INDEX of 0 -- real INDEX values only matter
  # for the test set, where predictions are written back by row.
  df$INDEX <- 0
  # Same feature set and ordering as test_tuner().
  keep <- c("COMPLEXITY","APRON_SIZE","CRAFT_STAND","C02",
            "PS_AVG","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG",
            "RN_SUM","VIS_MNM","CLA","BASE","MIST","HM_INV",
            paste0("ARPARP", c(1:9, 11:15)),
            paste0("ODPARP", c(1:9, 11:15)),
            paste0("SDT_DY", c("월","화","수","목","금","토","일")),
            paste0("FLO", c("A","B","F","H","I","J","L")),
            "PAST","REG_RATEREG1","REG_RATEREG2","REG_RATEREG3",
            "INDEX","DLY","REG","SDT_DD","STT")
  df %>% select(all_of(keep))
}
test_tuner <- function(df){
  # Keep the modeling features plus the bookkeeping columns (INDEX, DLY,
  # REG, SDT_DD, STT); column order matches the original select() exactly.
  keep <- c("COMPLEXITY","APRON_SIZE","CRAFT_STAND","C02",
            "PS_AVG","WSPD_AVG","WSPD_MAX","WSPD_INS","CA_TOT_AVG",
            "RN_SUM","VIS_MNM","CLA","BASE","MIST","HM_INV",
            paste0("ARPARP", c(1:9, 11:15)),
            paste0("ODPARP", c(1:9, 11:15)),
            paste0("SDT_DY", c("월","화","수","목","금","토","일")),
            paste0("FLO", c("A","B","F","H","I","J","L")),
            "PAST","REG_RATEREG1","REG_RATEREG2","REG_RATEREG3",
            "INDEX","DLY","REG","SDT_DD","STT")
  df %>% select(all_of(keep))
}
# Normalize train and test together so both use the same min/max per column,
# then split them back apart via the label column.
tuned_train <- train_tuner(train)
tuned_test <- test_tuner(test)
tuned_train$label <- 'train'
tuned_test$label <- 'test'
pool <- rbind(tuned_train, tuned_test)
# Columns 1..61 are the numeric features; bookkeeping columns pass through.
pool_norm <- cbind(as.data.frame(lapply(pool[,1:61], normalize)),
INDEX=pool$INDEX,
DLY=pool$DLY,
REG=pool$REG,
SDT_DD=pool$SDT_DD,
STT=pool$STT,
label=pool$label)
train_norm <- pool_norm %>% filter(label=='train') %>% select(-label)
test_norm <- pool_norm %>% filter(label=='test') %>% select(-label)
train <- train_norm %>% select(-c(INDEX,REG,SDT_DD,STT))
# Five ROSE resamples each for tuning (5k rows), final fitting (10k rows),
# and held-out evaluation.
train_temp <- list()
train_actual <- list()
test_temp <- list()
table(train$DLY)
0 1
117940 16333
# ROSE both-sampling to ~30% positives to counter the 1:7 class imbalance.
for (k in 1:5){
train_temp[[k]] <- ovun.sample(DLY~., data=train, N=5000, method="both", p=0.3, seed=k)$data
train_actual[[k]] <- ovun.sample(DLY~., data=train, N=10000, method="both", p=0.3, seed=k)$data
}
# NOTE(review): the evaluation folds below are drawn from the full training
# pool, so rows may overlap with the ROSE resamples used for fitting --
# potential leakage in the tuning MSE; confirm whether this was intended.
for (k in 1:5){
set.seed(k)
idx <- sample(1:nrow(train), 5000*0.3)
test_temp[[k]] <- train[idx,]
}
test_final <- test_norm %>% mutate(STT=as.integer(gsub(':','',STT))) %>% select(-DLY)
4.4 파라미터 탐색
4.4.1 Random Forest
Random Forest를 학습하기 전에 앞서, 사용될 하이퍼 파라미터를 알아 봤습니다.
- mtry : 선택할 Feature의 개수 (예측의 경우 변수의 개수/3 적합)
- ntrees : 만들어 낼 나무의 개수
4.4.1.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Grid-search mtry around the regression rule-of-thumb (p / 3): for each
# candidate, average the held-out MSE over the five resample folds.
vars <- as.integer((length(colnames(train)) - 1) / 3)
c_mtry <- c(vars - 1, vars, vars + 1)
mmse <- vapply(c_mtry, function(m) {
  fold_err <- vapply(1:5, function(k) {
    set.seed(k)
    cat(m, k, "\n")  # progress marker: candidate, fold
    fit <- randomForest(DLY ~ ., data = train_temp[[k]], mtry = m)
    mse(test_temp[[k]]$DLY, predict(fit, test_temp[[k]]))
  }, numeric(1))
  mean(fold_err)
}, numeric(1))
table_rf <- data.frame(mtry = c_mtry, mmse = mmse) %>% arrange(mmse)
table_rf %>% head(5)
4.4.2 Logistic Regression
로지스틱 회귀 분석은 조정해줄 하이퍼 파라미터는 없지만,
변수 선택법을 통해 적합한 모델을 찾았습니다.
4.4.2.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Stepwise (both-direction) logistic regression between the intercept-only
# model and the full model plus the MIST:HM_INV interaction, evaluated by
# mean held-out MSE over the five folds; no hyper-parameter to record.
c_error <- c()
for (k in 1:5){
  full_fit <- glm(DLY ~ . + MIST:HM_INV, data = train_temp[[k]], family = 'binomial')
  null_fit <- glm(DLY ~ 1, data = train_temp[[k]], family = 'binomial')
  best_fit <- step(null_fit,
                   scope = list(lower = null_fit, upper = full_fit),
                   direction = 'both', trace = 0)
  prob <- predict(best_fit, test_temp[[k]], type = 'response')
  c_error <- append(c_error, mse(test_temp[[k]]$DLY, prob))
}
table_glm <- data.frame(parameter = NA, mmse = mean(c_error))
table_glm %>% head(1)
4.4.3 Support Vector Machine
SVM을 학습하기 앞서, 사용될 하이퍼 파라미터를 찾아봤습니다.
C : 제약을 위반할 때의 비용
gamma : Gaussian 커널을 조절하기 위한 파라미터
4.4.3.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Grid-search the RBF-SVM cost (C) and kernel width, scored by mean
# held-out MSE over the five folds.
c_hyper <- c(0.01, 0.1, 1, 5, 10)
c_gamma <- c(0.1, 0.5, 1)
hyper<-c(); gamma<-c(); mmse<-c()
for(i in c_hyper){
for(j in c_gamma){
c_error<-c()
for(k in 1:5){
set.seed(k)
# BUG FIX: kernlab::ksvm has no `gamma` argument -- the RBF width is passed
# as kpar = list(sigma = ...). The original `gamma=j` was never used, so
# every j fitted the same model and the gamma column of table_svm was
# meaningless.
mod_svm <- ksvm(DLY~., data=train_temp[[k]],
kernel="rbfdot", C=i, kpar=list(sigma=j))
pred <- predict(mod_svm, test_temp[[k]])
error <- mse(pred, test_temp[[k]]$DLY)
c_error <- c_error %>% append(error)
}
hyper <- hyper %>% append(i)
gamma <- gamma %>% append(j)
mmse <- mmse %>% append(mean(c_error))
}
}
table_svm <- data.frame(hyper, gamma, mmse) %>% arrange(mmse)
table_svm %>% head(5)
4.4.4 eXtreme Gradient Boosting
XGBoost 모델은 현재 Kaggle에서 자주 사용되며 가장 인기 있는 모델 중 하나입니다.
모델을 사용하기 전에 사용되는 파라미터를 먼저 알아보았습니다.
- max_depth : 최대 깊이 노드
- nrounds : 부스팅 반복 수
- eta : 학습률
4.4.4.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Grid-search XGBoost over tree depth, boosting rounds, and learning rate,
# scored by mean held-out MSE over the five folds.
c_max_depth <- c(4, 6, 8)
c_nrounds <- c(50, 70, 100, 150, 200)
c_eta <- c(0.1, 0.12, 0.14)
max_depth<-c(); nrounds<-c(); eta<-c(); mmse<-c()
for(i in c_max_depth){
for(j in c_nrounds){
for(l in c_eta){
c_error<-c()
for(k in 1:5){
set.seed(k)
mod_xgb <- xgboost(data=as.matrix(train_temp[[k]] %>% select(-'DLY')),
label=as.numeric(train_temp[[k]]$DLY),
max_depth=i,nrounds=j,eta=l,
objective="binary:logistic",
early_stopping_rounds=10,
verbose=0)
pred <- predict(mod_xgb, as.matrix(test_temp[[k]] %>% select(-'DLY')))
error <- mse(pred, test_temp[[k]]$DLY)
c_error <- c_error %>% append(error)
}
max_depth <- max_depth %>% append(i)
nrounds <- nrounds %>% append(j)
# BUG FIX: the original appended `k` (the inner fold index, always 5 at
# this point) instead of `l`, so table_xgb's eta column never reflected
# the eta actually used and the "best eta" it reported was wrong.
eta <- eta %>% append(l)
mmse <- mmse %>% append(mean(c_error))
}
}
}
table_xgb <- data.frame(max_depth,nrounds,eta,mmse) %>% arrange(mmse)
table_xgb %>% head(5)
4.4.5 Neural Network
인공 신경망 모델은 입력 신호와 출력 신호 사이의 관계를 학습하며
인간의 뇌가 활동하는 방법과 비슷하게 움직입니다.
- size : 은닉 계층의 뉴런 수
- maxit : 반복 횟수
4.4.5.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Grid-search the single-hidden-layer network: hidden size 3..10 and maxit
# in {100, 200, 300}, scored by mean held-out MSE over the five folds.
c_size <- 3:10
c_maxit <- c(100, 200, 300)
size <- c(); maxit <- c(); mmse <- c()
for (hidden in c_size){
  for (iters in c_maxit){
    fold_err <- numeric(5)
    for (k in 1:5){
      set.seed(k)
      # NOTE(review): act.fct and type are not formal arguments of nnet();
      # kept verbatim to preserve the original call's behavior -- confirm
      # they are intentionally absorbed by `...`.
      fit <- nnet(DLY~., data=train_temp[[k]],
                  size=hidden, decay=5e-04, maxit=iters,
                  act.fct="logistic", type="prob", trace=F)
      fold_err[k] <- mse(predict(fit, test_temp[[k]]), test_temp[[k]]$DLY)
    }
    size <- append(size, hidden)
    maxit <- append(maxit, iters)
    mmse <- append(mmse, mean(fold_err))
  }
}
table_nnet <- data.frame(size,maxit,mmse) %>% arrange(mmse)
table_nnet %>% head(5)
5 Ensemble
5.1 학습법 내 앙상블
불균형 데이터를 배깅함에 있어서 로즈 샘플링을 적용하였습니다.
이는 10,000개의 로즈 표집을 적합한 모델 5개의 평균 예측값을
학습법의 결과 예측값으로 설정합니다.
5.1.0.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
## Within-method bagging: fit each of the five learners on each of the five
## 10k-row ROSE resamples, using the best hyper-parameters (row 1 of each
## table_*), and keep the five fitted models per method for averaging.
list_rf<-list(); list_glm<-list(); list_svm<-list(); list_xgb<-list(); list_nnet<-list()
for (k in 1:5){
  set.seed(k)
  # Random Forest with the tuned mtry.
  fit_rf <- randomForest(DLY~., data=train_actual[[k]], mtry=table_rf$mtry[1], ntree=500)
  # Stepwise logistic regression between null and full (+interaction) models.
  fit_glm_full <- glm(DLY~.+MIST:HM_INV, data=train_actual[[k]], family='binomial')
  fit_glm_null <- glm(DLY~1, data=train_actual[[k]], family='binomial')
  fit_glm <- step(fit_glm_null, scope=list(lower=fit_glm_null, upper=fit_glm_full), direction='both', trace=0)
  # RBF SVM with the tuned cost/width (arguments kept exactly as tuned).
  fit_svm <- ksvm(DLY~., data=train_actual[[k]], kernel="rbfdot", C=table_svm$hyper[1], gamma=table_svm$gamma[1])
  # XGBoost with the tuned depth/rounds/eta.
  fit_xgb <- xgboost(data=as.matrix(train_actual[[k]] %>% select(-DLY)), label=as.numeric(train_actual[[k]]$DLY),
                     max_depth=table_xgb$max_depth[1], nrounds=table_xgb$nrounds[1], eta=table_xgb$eta[1],
                     objective="binary:logistic", early_stopping_rounds=10, verbose=0)
  # Neural network with the tuned size/maxit.
  fit_nnet <- nnet(DLY~., data=train_actual[[k]], size=table_nnet$size[1], decay=5e-04, maxit=table_nnet$maxit[1],
                   act.fct="logistic", type="prob", trace=F)
  list_rf[[k]] <- fit_rf
  list_glm[[k]] <- fit_glm
  list_svm[[k]] <- fit_svm
  list_xgb[[k]] <- fit_xgb
  list_nnet[[k]] <- fit_nnet
}
5.2 학습법 간 앙상블
각 모델의 가지고 있는 장점들은 살리고 단점은 줄이기위해 학습법 간에 앙상블을 시도했습니다.
5.2.0.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
pred_train <- function(line){
  # Average each method's five bagged models on one training-format row
  # (a row that still carries the DLY column, dropped for the xgb matrix).
  # Returns c(rf, glm, svm, xgb, nnet) mean predictions.
  n_models <- 5
  p_rf <- numeric(n_models); p_glm <- numeric(n_models); p_svm <- numeric(n_models)
  p_xgb <- numeric(n_models); p_nnet <- numeric(n_models)
  for (m in seq_len(n_models)){
    p_rf[m]   <- predict(list_rf[[m]], line)
    p_glm[m]  <- predict(list_glm[[m]], line, type='response')
    p_svm[m]  <- predict(list_svm[[m]], line)
    p_xgb[m]  <- predict(list_xgb[[m]], as.matrix(line %>% select(-DLY)))
    p_nnet[m] <- predict(list_nnet[[m]], line)
  }
  c(mean(p_rf), mean(p_glm), mean(p_svm), mean(p_xgb), mean(p_nnet))
}
pred_test <- function(line){
  # Same as pred_train() but for test rows, which have no DLY column, so the
  # xgb matrix is built from the row as-is.
  n_models <- 5
  p_rf <- numeric(n_models); p_glm <- numeric(n_models); p_svm <- numeric(n_models)
  p_xgb <- numeric(n_models); p_nnet <- numeric(n_models)
  for (m in seq_len(n_models)){
    p_rf[m]   <- predict(list_rf[[m]], line)
    p_glm[m]  <- predict(list_glm[[m]], line, type='response')
    p_svm[m]  <- predict(list_svm[[m]], line)
    p_xgb[m]  <- predict(list_xgb[[m]], as.matrix(line))
    p_nnet[m] <- predict(list_nnet[[m]], line)
  }
  c(mean(p_rf), mean(p_glm), mean(p_svm), mean(p_xgb), mean(p_nnet))
}
5.3 Predict
5.3.1 Prediction about REG Test data
REG를 붙인 데이터들에 대해, 같은 REG를 가진 기체가 같은 날짜에 지연 되었다면,
그 다음 비행들도 A/C접속으로 지연이 될 확률이 높으므로 같은 REG를 가진 데이터끼리 묶어서 예측했습니다.
5.3.1.1 이 파트는 코드 실행 시 시간이 많이 소요되므로 eval=FALSE 를 통해 실행되지 않도록 설정했습니다. 실행을 원하시면 해당 문구를 삭제해주세요.
# Sequential prediction: for each day (16..30) and each tail number (REG),
# predict flights in schedule order. A flight's PAST flag is set from the
# previous flight's predicted delay, modeling A/C-connection propagation.
# A row's class is a majority vote (>= 3 of the 5 methods' probabilities
# >= 0.5); its probability is the mean of the 5 method means.
output_final <- data.frame()
for(i in 16:30){
testset <- test_final %>% filter(SDT_DD==i) %>% arrange(REG,STT) %>% select(-c(SDT_DD,STT))
reg_list <- unique(testset$REG)
output2 <- data.frame()
for(j in reg_list){
testset_REG <- testset %>% filter(REG==j)
testset_REG$PAST <- 0
idx <- testset_REG$INDEX
testset_REG <- testset_REG %>% select(-INDEX)
if(j!=''){
# Known tail number: propagate PAST to the next flight of the same aircraft.
testset_REG <- testset_REG %>% select(-REG)
class <- c(); pred <- c()
nr_reg<-nrow(testset_REG)
for(k in 1:nr_reg){
preds <- pred_test(testset_REG[k,])
if(k==1){
pred_class <- ifelse(sum(ifelse(preds>=.5,1,0))>=3,1,0)
pred_prob <- mean(preds)
# NOTE(review): when nr_reg == 1 this writes to PAST[2], past the end of a
# one-row frame -- likely an error or silent growth; confirm single-flight
# REG groups are handled as intended.
testset_REG$PAST[2] <- ifelse(pred_class==1,1,0)
}
else if(k>1 & k<nr_reg){
# Middle flights: identical vote/probability, PAST set on the next row.
pred_class <- ifelse(sum(ifelse(preds>=.5,1,0))>=3,1,0)
pred_prob <- mean(preds)
testset_REG$PAST[k+1] <- ifelse(pred_class==1,1,0)
}
else{
# Last flight of the day for this aircraft: nothing to propagate.
pred_class <- ifelse(sum(ifelse(preds>=.5,1,0))>=3,1,0)
pred_prob <- mean(preds)
}
class <- append(class, pred_class)
pred <- append(pred, pred_prob)
}
output1 <- cbind(INDEX=idx, DLY=class, DLY_RATE=pred)
}
else{
# Unknown tail number: no propagation chain is possible.
testset_REG <- testset_REG %>% select(-REG)
class <- c(); pred <- c()
nr_reg<-nrow(testset_REG)
for(k in 1:nr_reg){
preds <- pred_test(testset_REG[k,])
pred_class <- ifelse(sum(ifelse(preds>=.5,1,0))>=3,1,0)
pred_prob <- mean(preds)
# NOTE(review): this sets PAST on the row just predicted (index k, not
# k+1), so it cannot influence any prediction -- confirm intent.
testset_REG$PAST[k] <- ifelse(pred_class==1,1,0)
class <- append(class, pred_class)
pred <- append(pred, pred_prob)
}
output1 <- cbind(INDEX=idx, DLY=class, DLY_RATE=pred)
}
output2 <- rbind(output2, output1)
}
output_final <- rbind(output_final, output2)
}
str(output_final)
# Reattach the schedule columns by INDEX and restore the submission order.
shironmaro <- output_final %>% left_join(afsnt_dly, by='INDEX') %>%
select(SDT_YY,SDT_MM,SDT_DD,SDT_DY,ARP,ODP,FLO,FLT,AOD,STT,DLY,DLY_RATE,INDEX) %>%
arrange(INDEX) %>% select(-INDEX)
table(shironmaro$DLY)
head(shironmaro)
# write.csv(shironmaro,"shironmaro.csv",row.names=F)